luminal_python: suppress torch.export _guards_fn instead of disabling auto-dynamic shapes

Replaces the WIP `automatic_dynamic_shapes = False` workaround (commit 3a3cd049) with a targeted monkey-patch of `torch.export._unlift._ok_to_generate_guards_fn`. That function already supports a call-stack opt-out (used by executorch / modai / on_device_ai / torchao); we extend it with a "luminal" check so torch.export skips inserting the `_guards_fn` submodule whenever luminal is the embedder. Why the previous workaround was costly: with `automatic_dynamic_shapes = False`, the bench loop's `compiled(input_ids, cache_position=tensor([k]))` recompiles once per `cache_position` *value*, i.e. one full luminal compile per generated token. The gemma3-4b smoke run took ~2 hr of CPU time and 200 GB of host RSS. The `L` NameError ("L not defined") it was working around fires during aot_autograd's fx.Interpreter trace of a re-exported GraphModule that contains the `L`-referencing `_guards_fn` body — a dead-end for any non-dynamo consumer of the exported graph. Skipping `_guards_fn` generation at the source restores the compile-once-run-many behaviour of dynamic-shape promotion: dynamo promotes the varying dim to a SymInt on the second compile and reuses the same compiled graph for all subsequent values. The monkey-patch is scoped to luminal's call stack — other consumers of `torch.export` in the same Python process see unmodified behaviour. Verified via a multi-shape compile smoke (`compiled(rand(4,8))` then `compiled(rand(5,8))`): no `L` NameError. The remaining downstream `SymInt` input passthrough is handled by `_specialize_sym_scalar` in pt2.py and is unrelated to this fix. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
luminal_python: WIP workaround for dynamo "L not defined" on gemma3
2026-06-01 21:49:47 +09:00 · 2026-05-01 23:23:22 +00:00 · 2026-05-01 22:15:17 +00:00 · 2026-05-01 22:14:55 +00:00 · 2026-05-01 18:41:25 +00:00 · 2026-05-01 18:37:26 +00:00
37 changed files with 4999 additions and 235 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -37,3 +37,9 @@ __pycache__/
 dist/
 build/
 uv.lock
+
+# TTFT benchmark SQLite database (per-machine state)
+benchmarks/ttft/bench.db
+benchmarks/ttft/bench.db-journal
+benchmarks/ttft/bench.db-wal
+benchmarks/ttft/bench.db-shm
--- a/benchmarks/ttft/bench_python_baseline.py
+++ b/benchmarks/ttft/bench_python_baseline.py
@@ -0,0 +1,117 @@
+"""Pure HuggingFace/PyTorch TTFT + TPOT bench. Prints a JSON line on stdout.
+
+Measures:
+  TTFT — sum of single-token forward-pass durations over the prompt, using
+         a StaticCache. Methodology matches bench_python_luminal.py and the
+         rust path so the cross-path comparison is apples-to-apples.
+  TPOT — average time per output token during KV-cache greedy decode.
+"""
+
+import argparse
+import json
+import statistics
+import time
+
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers.cache_utils import StaticCache
+
+from bench_utils import encode_prompt, measure_tpot, static_cache_config
+
+DEFAULT_MODEL = "NousResearch/Meta-Llama-3-8B-Instruct"
+DEFAULT_PROMPT = "Explain what a neural network is in a paragraph."
+
+
+def main():  # CLI entry point: time eager HF forward passes and print one BENCH_RESULT line
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", default=DEFAULT_MODEL)
+    ap.add_argument("--prompt", default=DEFAULT_PROMPT)
+    ap.add_argument("--warmups", type=int, default=1)
+    ap.add_argument("--iters", type=int, default=3)
+    ap.add_argument("--dtype", default="float32", choices=["float32", "bfloat16", "float16"])
+    ap.add_argument("--decode-tokens", type=int, default=50,
+                    help="Number of tokens to generate for TPOT measurement (0 = skip).")
+    ap.add_argument("--max-cache-len", type=int, default=256,
+                    help="StaticCache max sequence length.")
+    args = ap.parse_args()
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # prefer CUDA when present
+    dtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[args.dtype]  # --dtype string -> torch dtype
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    input_ids = encode_prompt(tokenizer, args.prompt, device)
+    prompt_tokens = int(input_ids.shape[-1])  # prompt length = size of the last dim of input_ids
+
+    config = AutoConfig.from_pretrained(args.model)
+    config._attn_implementation = "eager"  # request the eager attention implementation
+
+    model = (
+        AutoModelForCausalLM.from_pretrained(args.model, config=config, torch_dtype=dtype)
+        .eval()
+        .to(device)
+    )
+
+    single_token = torch.zeros(1, 1, dtype=torch.long, device=device)  # placeholder (1, 1) token id 0 fed to every TTFT step
+
+    cache_config = static_cache_config(config)  # config normalized for StaticCache by bench_utils
+
+    def make_cache():  # builds a new StaticCache sized by --max-cache-len on each call
+        return StaticCache(
+            config=cache_config,
+            max_batch_size=1,
+            max_cache_len=args.max_cache_len,
+            device=device,
+            dtype=dtype,
+        )
+
+    def measure_ttft() -> float:
+        """Sum of per-token forward-pass durations over prompt_tokens steps."""
+        kv = make_cache()
+        # Eager init at position 0 to satisfy StaticCache.lazy_initialization.
+        with torch.no_grad():
+            model(single_token, past_key_values=kv,
+                  cache_position=torch.tensor([0], device=device))
+        total_ms = 0.0
+        for pos in range(1, prompt_tokens):  # position 0 was the untimed init call above
+            if device.type == "cuda":
+                torch.cuda.synchronize()  # drain pending GPU work before starting the timer
+            t0 = time.perf_counter()
+            with torch.no_grad():
+                model(single_token, past_key_values=kv,
+                      cache_position=torch.tensor([pos], device=device))
+                if device.type == "cuda":
+                    torch.cuda.synchronize()  # wait for the step to finish inside the timed span
+            total_ms += (time.perf_counter() - t0) * 1000.0  # seconds -> milliseconds
+        return total_ms
+
+    for _ in range(args.warmups):  # warmup passes; their results are discarded
+        measure_ttft()
+
+    ttft_samples_ms = [measure_ttft() for _ in range(args.iters)]  # one TTFT sample per --iters
+
+    result = {
+        "path": "python_baseline",
+        "model": args.model,
+        "device": str(device),
+        "dtype": args.dtype,
+        "prompt_tokens": prompt_tokens,
+        "iters": args.iters,
+        "ttft_ms": statistics.median(ttft_samples_ms),  # headline TTFT is the median sample
+        "ttft_ms_mean": sum(ttft_samples_ms) / len(ttft_samples_ms),
+        "ttft_ms_samples": ttft_samples_ms,
+        "note": "sequential per-token, StaticCache KV cache",
+    }
+
+    if args.decode_tokens > 0:  # TPOT fields are only added when decoding was requested
+        tpot_samples_ms = measure_tpot(model, input_ids, device, args.decode_tokens)
+        tpot_ms = sum(tpot_samples_ms) / len(tpot_samples_ms)  # mean per-token decode time
+        result["decode_tokens"] = args.decode_tokens
+        result["tpot_ms"] = tpot_ms
+        result["tpot_ms_samples"] = tpot_samples_ms
+        result["throughput_tps"] = 1000.0 / tpot_ms  # tokens per second
+
+    print("BENCH_RESULT " + json.dumps(result))  # single tagged JSON line on stdout
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/ttft/bench_python_luminal.py
+++ b/benchmarks/ttft/bench_python_luminal.py
@@ -0,0 +1,196 @@
+"""Python -> Luminal TTFT + TPOT bench via torch.compile(backend=luminal_backend).
+
+Methodology mirrors examples/llama (the Rust path):
+  - One eager prefill step initialises the StaticCache (required by transformers'
+    StaticCache.lazy_initialization) before compilation.
+  - TTFT: run one forward pass per prompt token sequentially, each advancing
+    cache_position by 1; sum durations.
+  - TPOT: run --decode-tokens more single-token passes; average durations.
+  - StaticCache pre-allocates K/V buffers up to max_cache_len; no growing allocation.
+
+Prints a BENCH_RESULT JSON line on stdout.
+"""
+
+import argparse
+import gc
+import json
+import statistics
+import time
+
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers.cache_utils import StaticCache
+
+from bench_utils import encode_prompt, static_cache_config
+from luminal import luminal_backend
+
+DEFAULT_MODEL = "NousResearch/Meta-Llama-3-8B-Instruct"
+DEFAULT_PROMPT = "Explain what a neural network is in a paragraph."
+
+
+def main():  # CLI entry point: compile the model with luminal_backend, time it, print BENCH_RESULT
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", default=DEFAULT_MODEL)
+    ap.add_argument("--prompt", default=DEFAULT_PROMPT)
+    ap.add_argument("--warmups", type=int, default=1)
+    ap.add_argument("--iters", type=int, default=3)
+    ap.add_argument(
+        "--search-iters",
+        type=int,
+        default=500,
+        help="Egraph search iterations (matches examples/llama default of 500).",
+    )
+    ap.add_argument(
+        "--decode-tokens",
+        type=int,
+        default=50,
+        help="Tokens to generate for TPOT measurement (0 = skip TPOT).",
+    )
+    ap.add_argument(
+        "--max-cache-len",
+        type=int,
+        default=256,
+        help="StaticCache max sequence length.",
+    )
+    ap.add_argument(
+        "--dtype",
+        default="float32",
+        choices=["float32", "bfloat16", "float16"],
+        help="Torch dtype for model + StaticCache.",
+    )
+    args = ap.parse_args()
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # prefer CUDA when present
+    dtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[args.dtype]  # --dtype string -> torch dtype
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    input_ids = encode_prompt(tokenizer, args.prompt, device)
+    prompt_tokens = int(input_ids.shape[-1])  # only the prompt length is used below, not the ids
+
+    config = AutoConfig.from_pretrained(args.model)
+    config._attn_implementation = "eager"  # request the eager attention implementation
+
+    model = (
+        AutoModelForCausalLM.from_pretrained(args.model, config=config, torch_dtype=dtype)
+        .eval()
+        .to(device)
+    )
+
+    single_token = torch.zeros(1, 1, dtype=torch.long, device=device)  # placeholder (1, 1) token id 0 fed to every step
+
+    cache_config = static_cache_config(config)  # config normalized for StaticCache by bench_utils
+
+    def make_cache():  # builds a new StaticCache sized by --max-cache-len on each call
+        return StaticCache(
+            config=cache_config,
+            max_batch_size=1,
+            max_cache_len=args.max_cache_len,
+            device=device,
+            dtype=dtype,
+        )
+
+    # Step 0: run ONE eager prefill to initialise the cache tensors and call
+    # mark_static_address (required by transformers' StaticCache before compile).
+    cache = make_cache()
+    with torch.no_grad():
+        model(single_token, past_key_values=cache, cache_position=torch.tensor([0], device=device))
+
+    # Compile for a single-token input — same graph is reused for every step.
+    # Compilation happens on the first call after the eager init above.
+    t0 = time.perf_counter()  # timer starts before torch.compile, so compile_ms spans wrapping + first call
+    compiled = torch.compile(
+        model,
+        backend=luminal_backend,
+        options={"search_iterations": args.search_iters},
+    )
+    cache_position = torch.tensor([1], dtype=torch.long, device=device)
+    with torch.no_grad():
+        compiled(single_token, past_key_values=cache, cache_position=cache_position)
+        if device.type == "cuda":
+            torch.cuda.synchronize()  # wait for the first compiled call before stopping the timer
+    compile_ms = (time.perf_counter() - t0) * 1000.0
+
+    gc.collect()  # release Python garbage before the timed runs
+    if device.type == "cuda":
+        torch.cuda.empty_cache()
+
+    def one_step(pos: int, kv_cache):  # one compiled single-token forward at cache position `pos`
+        cache_pos = torch.tensor([pos], dtype=torch.long, device=device)
+        with torch.no_grad():
+            compiled(single_token, past_key_values=kv_cache, cache_position=cache_pos)
+            if device.type == "cuda":
+                torch.cuda.synchronize()  # block until the step has finished
+
+    def measure_ttft():
+        """Sum of per-token forward-pass durations over prompt_tokens steps.
+
+        Uses a fresh cache so each TTFT measurement is independent.
+        """
+        kv = make_cache()
+        # Eager init for this fresh cache (required before compiled can run on it).
+        with torch.no_grad():
+            model(single_token, past_key_values=kv, cache_position=torch.tensor([0], device=device))
+        total_ms = 0.0
+        # Step 0 was the eager init above; measure from step 1 to prompt_tokens.
+        for pos in range(1, prompt_tokens):
+            if device.type == "cuda":
+                torch.cuda.synchronize()  # drain pending GPU work before starting the timer
+            t0 = time.perf_counter()
+            one_step(pos, kv)
+            total_ms += (time.perf_counter() - t0) * 1000.0  # seconds -> milliseconds
+        return total_ms
+
+    def measure_tpot(n, start_pos: int):
+        """Average single-token forward-pass duration over n decode steps."""
+        kv = make_cache()  # NOTE(review): fresh cache; positions 2..start_pos-1 are never written
+        # Eager init
+        with torch.no_grad():
+            model(single_token, past_key_values=kv, cache_position=torch.tensor([0], device=device))
+        # One warmup step.
+        one_step(1, kv)
+        step_times_ms = []
+        for i in range(n):
+            pos = start_pos + i  # NOTE(review): not checked against --max-cache-len
+            if device.type == "cuda":
+                torch.cuda.synchronize()  # drain pending GPU work before starting the timer
+            t0 = time.perf_counter()
+            one_step(pos, kv)
+            step_times_ms.append((time.perf_counter() - t0) * 1000.0)
+        return step_times_ms  # per-step samples; the caller computes the average
+
+    # Warmups before timing TTFT (all run after compilation is complete).
+    for _ in range(args.warmups):
+        measure_ttft()
+
+    ttft_samples_ms = [measure_ttft() for _ in range(args.iters)]  # one TTFT sample per --iters
+
+    tpot_ms_samples = []
+    if args.decode_tokens > 0:
+        tpot_ms_samples = measure_tpot(args.decode_tokens, start_pos=prompt_tokens)  # decode positions start at the prompt length
+
+    tpot_ms = sum(tpot_ms_samples) / len(tpot_ms_samples) if tpot_ms_samples else None  # None when TPOT was skipped
+    throughput_tps = (1000.0 / tpot_ms) if tpot_ms else None  # tokens per second
+
+    result = {
+        "path": "python_luminal",
+        "model": args.model,
+        "device": str(device),
+        "dtype": args.dtype,
+        "prompt_tokens": prompt_tokens,
+        "iters": args.iters,
+        "ttft_ms": statistics.median(ttft_samples_ms),  # headline TTFT is the median sample
+        "ttft_ms_mean": sum(ttft_samples_ms) / len(ttft_samples_ms),
+        "ttft_ms_samples": ttft_samples_ms,
+        "compile_ms": compile_ms,
+        "search_iters": args.search_iters,
+        "decode_tokens": args.decode_tokens if args.decode_tokens > 0 else None,
+        "tpot_ms": tpot_ms,
+        "tpot_ms_samples": tpot_ms_samples,
+        "throughput_tps": throughput_tps,
+        "note": "sequential per-token, StaticCache KV cache",
+    }
+    print("BENCH_RESULT " + json.dumps(result))  # single tagged JSON line on stdout
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/ttft/bench_python_torch_compile.py
+++ b/benchmarks/ttft/bench_python_torch_compile.py
@@ -0,0 +1,138 @@
+"""Vanilla torch.compile TTFT + TPOT bench. Prints a JSON line on stdout.
+
+Uses the default inductor backend (torch.compile without a custom backend).
+TTFT uses sequential per-token prefill with a StaticCache so the methodology
+matches bench_python_baseline.py, bench_python_luminal.py, and the rust path.
+"""
+
+import argparse
+import json
+import statistics
+import time
+
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers.cache_utils import StaticCache
+
+from bench_utils import encode_prompt, measure_tpot, static_cache_config
+
+DEFAULT_MODEL = "NousResearch/Meta-Llama-3-8B-Instruct"
+DEFAULT_PROMPT = "Explain what a neural network is in a paragraph."
+
+
+def main():  # CLI entry point: time torch.compile (default backend) forward passes, print BENCH_RESULT
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", default=DEFAULT_MODEL)
+    ap.add_argument("--prompt", default=DEFAULT_PROMPT)
+    ap.add_argument("--warmups", type=int, default=1)
+    ap.add_argument("--iters", type=int, default=3)
+    ap.add_argument("--dtype", default="float32", choices=["float32", "bfloat16", "float16"])
+    ap.add_argument(
+        "--decode-tokens", type=int, default=50,
+        help="Number of tokens to generate for TPOT measurement (0 = skip).",
+    )
+    ap.add_argument("--max-cache-len", type=int, default=256,
+                    help="StaticCache max sequence length.")
+    args = ap.parse_args()
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # prefer CUDA when present
+    dtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[args.dtype]  # --dtype string -> torch dtype
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    input_ids = encode_prompt(tokenizer, args.prompt, device)
+    prompt_tokens = int(input_ids.shape[-1])  # prompt length = size of the last dim of input_ids
+
+    config = AutoConfig.from_pretrained(args.model)
+    config._attn_implementation = "eager"  # request the eager attention implementation
+
+    model = (
+        AutoModelForCausalLM.from_pretrained(args.model, config=config, torch_dtype=dtype)
+        .eval()
+        .to(device)
+    )
+
+    single_token = torch.zeros(1, 1, dtype=torch.long, device=device)  # placeholder (1, 1) token id 0 fed to every TTFT step
+
+    cache_config = static_cache_config(config)  # config normalized for StaticCache by bench_utils
+
+    def make_cache():  # builds a new StaticCache sized by --max-cache-len on each call
+        return StaticCache(
+            config=cache_config,
+            max_batch_size=1,
+            max_cache_len=args.max_cache_len,
+            device=device,
+            dtype=dtype,
+        )
+
+    # Eager init on the uncompiled model so the StaticCache buffers get
+    # registered (mark_static_address) before torch.compile traces them.
+    init_cache = make_cache()
+    with torch.no_grad():
+        model(single_token, past_key_values=init_cache,
+              cache_position=torch.tensor([0], device=device))
+
+    compiled = torch.compile(model)  # wrapped outside the compile_ms timer below
+
+    # First compiled call triggers JIT compilation; time it as compile_ms.
+    if device.type == "cuda":
+        torch.cuda.synchronize()  # drain the eager init before starting the timer
+    t0 = time.perf_counter()
+    with torch.no_grad():
+        compiled(single_token, past_key_values=init_cache,
+                 cache_position=torch.tensor([1], device=device))
+        if device.type == "cuda":
+            torch.cuda.synchronize()  # wait for the first compiled call before stopping the timer
+    compile_ms = (time.perf_counter() - t0) * 1000.0
+
+    def measure_ttft() -> float:
+        """Sum of per-token compiled-forward durations over prompt_tokens steps."""
+        kv = make_cache()
+        # Fresh cache needs eager init via the uncompiled model first.
+        with torch.no_grad():
+            model(single_token, past_key_values=kv,
+                  cache_position=torch.tensor([0], device=device))
+        total_ms = 0.0
+        for pos in range(1, prompt_tokens):  # position 0 was the untimed init call above
+            if device.type == "cuda":
+                torch.cuda.synchronize()  # drain pending GPU work before starting the timer
+            t0 = time.perf_counter()
+            with torch.no_grad():
+                compiled(single_token, past_key_values=kv,
+                         cache_position=torch.tensor([pos], device=device))
+                if device.type == "cuda":
+                    torch.cuda.synchronize()  # wait for the step to finish inside the timed span
+            total_ms += (time.perf_counter() - t0) * 1000.0  # seconds -> milliseconds
+        return total_ms
+
+    for _ in range(args.warmups):  # warmup passes; their results are discarded
+        measure_ttft()
+
+    ttft_samples_ms = [measure_ttft() for _ in range(args.iters)]  # one TTFT sample per --iters
+
+    result = {
+        "path": "python_torch_compile",
+        "model": args.model,
+        "device": str(device),
+        "dtype": args.dtype,
+        "prompt_tokens": prompt_tokens,
+        "iters": args.iters,
+        "ttft_ms": statistics.median(ttft_samples_ms),  # headline TTFT is the median sample
+        "ttft_ms_mean": sum(ttft_samples_ms) / len(ttft_samples_ms),
+        "ttft_ms_samples": ttft_samples_ms,
+        "compile_ms": compile_ms,
+        "note": "sequential per-token, StaticCache KV cache (torch.compile inductor)",
+    }
+
+    if args.decode_tokens > 0:  # TPOT fields are only added when decoding was requested
+        tpot_samples_ms = measure_tpot(compiled, input_ids, device, args.decode_tokens)  # compiled model, full prompt as prefill input
+        tpot_ms = sum(tpot_samples_ms) / len(tpot_samples_ms)  # mean per-token decode time
+        result["decode_tokens"] = args.decode_tokens
+        result["tpot_ms"] = tpot_ms
+        result["tpot_ms_samples"] = tpot_samples_ms
+        result["throughput_tps"] = 1000.0 / tpot_ms  # tokens per second
+
+    print("BENCH_RESULT " + json.dumps(result))  # single tagged JSON line on stdout
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/ttft/bench_utils.py
+++ b/benchmarks/ttft/bench_utils.py
@@ -0,0 +1,94 @@
+"""Shared helpers for the Python benchmark scripts."""
+
+import time
+
+import torch
+
+
+class _CfgWithoutKvShared:
+    """Wrapper that hides `num_kv_shared_layers` from a HF config.
+
+    transformers 5.6 has a bug in StaticCache.__init__:
+        if hasattr(config, "num_kv_shared_layers"):
+            layer_types = layer_types[: -config.num_kv_shared_layers]
+    For configs where the attribute is 0 (e.g. Gemma-4), `[:-0]` returns an
+    empty list, leaving StaticCache with zero layer slots, and the LM's
+    first `past_key_values.update(..., layer_idx=0)` raises IndexError.
+
+    This wrapper makes `hasattr(...)` return False so the bad branch never
+    fires; other lookups are forwarded. Used via `static_cache_config(config)`.
+    """
+    __slots__ = ("_inner",)  # the wrapped config is the only state held
+
+    def __init__(self, inner):
+        object.__setattr__(self, "_inner", inner)
+
+    def __getattr__(self, name):  # only reached when normal attribute lookup fails
+        if name in ("num_kv_shared_layers", "_inner"):  # "_inner" here means the slot is unset; forwarding would recurse forever
+            raise AttributeError(name)
+        return getattr(self._inner, name)
+
+    def get_text_config(self, *args, **kwargs):  # re-wrap so the nested text config stays hidden too
+        return _CfgWithoutKvShared(self._inner.get_text_config(*args, **kwargs))
+
+
+def static_cache_config(config):
+    """Normalize a HF config before passing it to `StaticCache(config=..., ...)`.
+
+    Steps, in order:
+      1. Unwrap: when `config` has a `text_config` attribute (multimodal
+         wrappers such as Gemma4ForConditionalGeneration nest the LM config
+         there), use it so layer/head counts match the inner LM.
+      2. Hide: when that config reports `num_kv_shared_layers == 0`, return
+         it inside `_CfgWithoutKvShared` (transformers 5.6 slice-bug workaround).
+    """
+    lm_config = getattr(config, "text_config", config)
+    if getattr(lm_config, "num_kv_shared_layers", None) != 0:
+        return lm_config
+    return _CfgWithoutKvShared(lm_config)
+
+
+def encode_prompt(tokenizer, prompt: str, device):
+    """Return `prompt` as token ids on `device`, via the chat template when the tokenizer has one."""
+    chat = [{"role": "user", "content": prompt}]
+    try:
+        tokens = tokenizer.apply_chat_template(
+            chat, add_generation_prompt=True, return_tensors="pt"
+        )
+    except (ValueError, AttributeError):  # template call failed -> tokenize the raw prompt
+        tokens = tokenizer(prompt, return_tensors="pt")
+    if hasattr(tokens, "input_ids"):  # object exposing `.input_ids`
+        tokens = tokens.input_ids
+    elif isinstance(tokens, dict):  # plain mapping
+        tokens = tokens["input_ids"]
+    return tokens.to(device)
+
+
+def measure_tpot(model, input_ids, device, decode_tokens: int) -> list[float]:
+    """Prefill once, run one untimed decode step, then return the wall time (ms) of each of `decode_tokens` greedy steps."""
+    with torch.no_grad():
+        out = model(input_ids, use_cache=True)  # untimed prefill over the whole prompt
+        if device.type == "cuda":
+            torch.cuda.synchronize()
+        past = out.past_key_values
+        next_id = out.logits[:, -1:].argmax(-1)  # greedy pick from the last position's logits
+
+        out = model(next_id, past_key_values=past, use_cache=True)  # untimed first decode step (warmup)
+        if device.type == "cuda":
+            torch.cuda.synchronize()
+        past = out.past_key_values
+        next_id = out.logits[:, -1:].argmax(-1)
+
+        step_times_ms = []
+        for _ in range(decode_tokens):
+            if device.type == "cuda":
+                torch.cuda.synchronize()  # drain pending GPU work before starting the timer
+            t0 = time.perf_counter()
+            out = model(next_id, past_key_values=past, use_cache=True)
+            if device.type == "cuda":
+                torch.cuda.synchronize()  # wait for the step to finish inside the timed span
+            step_times_ms.append((time.perf_counter() - t0) * 1000.0)  # seconds -> milliseconds
+            past = out.past_key_values  # cache hand-off and argmax happen after the timer stops
+            next_id = out.logits[:, -1:].argmax(-1)
+
+    return step_times_ms  # empty list when decode_tokens is 0
--- a/benchmarks/ttft/benchmarks.toml
+++ b/benchmarks/ttft/benchmarks.toml
@@ -0,0 +1,92 @@
+[ur_test]
+models = ["llama-8b", "qwen3-4b", "gemma3-4b", "gemma4-moe", "qwen3-moe"]
+# 3-point sweep (low/mid/high). The previous list [5, 10, 20, 50, 100, 500]
+# spent ~62 extra minutes on s=5/s=20/s=50 with little additional information.
+search_sweep_iters = [10, 100, 500]
+
+[configs.llama-8b]
+model = "NousResearch/Meta-Llama-3-8B-Instruct"
+rust_package = "llama"
+search_iters = 500
+iters = 10
+warmups = 2
+decode_tokens = 50
+# On-disk weights are bf16-majority. fp32 upcast doubled python_luminal's
+# egglog Search peak past the 525 GB unified pool and triggered SIGKILLs on
+# gemma3-4b (and same risk here). bf16 matches rust's load path.
+dtype = "bfloat16"
+
+[configs.as_fast_as_possible]
+prompt = "The"
+search_iters = 1
+iters = 1
+warmups = 0
+decode_tokens = 5
+
+[configs.qwen3-4b]
+model = "Qwen/Qwen3-4B"
+rust_package = "qwen"
+search_iters = 50
+iters = 10
+warmups = 2
+decode_tokens = 20
+# bf16-majority on-disk; see llama-8b note.
+dtype = "bfloat16"
+
+[configs.gemma3-4b]
+model = "unsloth/gemma-3-4b-it"
+rust_package = "gemma"
+search_iters = 50
+iters = 10
+warmups = 2
+decode_tokens = 20
+# bf16-majority on-disk; see llama-8b note.
+dtype = "bfloat16"
+
+[configs.gemma4-moe]
+model = "google/gemma-4-26B-A4B"
+rust_package = "gemma4_moe"
+search_iters = 50
+iters = 10
+warmups = 2
+decode_tokens = 20
+# 26B params at fp32 = 104 GB → OOM on a 94 GB GPU. Use bf16 (matches the
+# on-disk safetensors dtype) so the python paths can actually load.
+dtype = "bfloat16"
+
+[configs.qwen3-moe]
+model = "Qwen/Qwen3-30B-A3B"
+rust_package = "qwen3_moe"
+search_iters = 50
+iters = 10
+warmups = 2
+decode_tokens = 20
+# 30B params at fp32 = 120 GB → OOM. See gemma4-moe note.
+dtype = "bfloat16"
+
+[configs.llama-8b-const]
+model = "NousResearch/Meta-Llama-3-8B-Instruct"
+rust_package = "llama"
+prompt = "We the People of the United States, in Order to form a more perfect Union, establish Justice, insure domestic Tranquility, provide for the common defence, promote the general Welfare, and secure the Blessings of Liberty to ourselves and our Posterity, do ordain and establish this Constitution for the United States of America."
+search_iters = 500
+iters = 10
+warmups = 2
+decode_tokens = 20
+
+[configs.qwen3-4b-const]
+model = "Qwen/Qwen3-4B"
+rust_package = "qwen"
+prompt = "We the People of the United States, in Order to form a more perfect Union, establish Justice, insure domestic Tranquility, provide for the common defence, promote the general Welfare, and secure the Blessings of Liberty to ourselves and our Posterity, do ordain and establish this Constitution for the United States of America."
+search_iters = 50
+iters = 10
+warmups = 2
+decode_tokens = 20
+
+[configs.gemma3-4b-const]
+model = "unsloth/gemma-3-4b-it"
+rust_package = "gemma"
+prompt = "We the People of the United States, in Order to form a more perfect Union, establish Justice, insure domestic Tranquility, provide for the common defence, promote the general Welfare, and secure the Blessings of Liberty to ourselves and our Posterity, do ordain and establish this Constitution for the United States of America."
+search_iters = 50
+iters = 10
+warmups = 2
+decode_tokens = 20
--- a/benchmarks/ttft/dashboard.html
+++ b/benchmarks/ttft/dashboard.html
@@ -0,0 +1,610 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>Luminal · Benchmark Dashboard</title>
+<link rel="preconnect" href="https://fonts.googleapis.com">
+<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+<link href="https://fonts.googleapis.com/css2?family=Geist:wght@300;400;500;600&family=Geist+Mono:wght@300;400;500&display=swap" rel="stylesheet">
+<script src="https://cdn.plot.ly/plotly-2.35.2.min.js" charset="utf-8"></script>
+<style>
+*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
+html { -webkit-font-smoothing: antialiased; scroll-behavior: smooth; }
+
+body {
+  font-family: 'Geist', system-ui, sans-serif;
+  background: #030712;
+  color: #d7d8d9;
+  min-height: 100vh;
+  line-height: 1.5;
+}
+
+/* ── NAV ── */
+nav {
+  position: sticky;
+  top: 0;
+  z-index: 50;
+  height: 56px;
+  background: rgba(8, 15, 17, 0.92);
+  backdrop-filter: blur(8px);
+  -webkit-backdrop-filter: blur(8px);
+  border-bottom: 1px solid #2d3335;
+  display: flex;
+  align-items: center;
+  padding: 0 24px;
+  gap: 0;
+}
+.nav-brand {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  font-family: 'Geist Mono', monospace;
+  font-size: 14px;
+  font-weight: 500;
+  letter-spacing: 0.05em;
+  color: #2faa6e;
+  text-decoration: none;
+}
+.nav-dot {
+  width: 6px;
+  height: 6px;
+  background: #2faa6e;
+  border-radius: 50%;
+  flex-shrink: 0;
+  animation: pulse-glow 2s ease-in-out infinite;
+}
+.nav-sep {
+  color: #2d3335;
+  margin: 0 14px;
+  font-size: 18px;
+  font-weight: 300;
+}
+.nav-page {
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: #7e8385;
+}
+
+@keyframes pulse-glow {
+  0%, 100% { opacity: 1; }
+  50%       { opacity: 0.35; }
+}
+
+/* ── MAIN ── */
+main {
+  max-width: 1200px;
+  margin: 0 auto;
+  padding: 40px 24px 80px;
+}
+
+/* ── PAGE HEADER ── */
+.page-header {
+  margin-bottom: 40px;
+  padding-bottom: 32px;
+  border-bottom: 1px solid #1c2225;
+}
+.page-eyebrow {
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: #2faa6e;
+  margin-bottom: 10px;
+}
+.page-title {
+  font-size: 30px;
+  font-weight: 500;
+  letter-spacing: -0.025em;
+  color: #d7d8d9;
+  margin-bottom: 10px;
+}
+.page-meta {
+  font-size: 14px;
+  color: #7e8385;
+  display: flex;
+  align-items: center;
+  gap: 0;
+  flex-wrap: wrap;
+}
+.meta-sep {
+  font-family: 'Geist Mono', monospace;
+  color: #2d3335;
+  margin: 0 10px;
+}
+.meta-val {
+  font-family: 'Geist Mono', monospace;
+  font-size: 13px;
+  color: #5b5f61;
+}
+
+/* ── LEGEND STRIP ── */
+.legend-strip {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 6px;
+  margin-bottom: 32px;
+}
+.legend-pill {
+  display: flex;
+  align-items: center;
+  gap: 6px;
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  color: #a1a4a5;
+  background: #141b1d;
+  border: 1px solid #2d3335;
+  border-radius: 2px;
+  padding: 4px 10px;
+}
+.legend-swatch {
+  width: 8px;
+  height: 8px;
+  border-radius: 50%;
+  flex-shrink: 0;
+}
+
+/* ── SECTIONS ── */
+section { margin-bottom: 48px; }
+.section-header {
+  display: flex;
+  align-items: baseline;
+  gap: 10px;
+  margin-bottom: 16px;
+  padding-bottom: 12px;
+  border-bottom: 1px solid #1c2225;
+  flex-wrap: wrap;
+}
+.section-eyebrow {
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: #404647;
+}
+.section-title {
+  font-size: 18px;
+  font-weight: 500;
+  color: #d7d8d9;
+  letter-spacing: -0.01em;
+}
+.section-title .unit {
+  color: #7e8385;
+  font-weight: 400;
+}
+.section-tag {
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+  color: #2faa6e;
+  background: #162322;
+  border: 1px solid #1c372e;
+  padding: 2px 8px;
+  border-radius: 2px;
+  margin-left: auto;
+}
+
+/* ── CHART GRID ── */
+.chart-grid {
+  display: grid;
+  gap: 10px;
+}
+.chart-card {
+  background: #141b1d;
+  border: 1px solid #2d3335;
+  border-radius: 2px;
+  overflow: hidden;
+  transition: border-color 150ms;
+  min-width: 0;
+}
+.chart-card:hover { border-color: #404647; }
+.chart-card-header {
+  padding: 10px 14px 0;
+  display: flex;
+  align-items: center;
+}
+.model-tag {
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.06em;
+  text-transform: uppercase;
+  color: #7e8385;
+}
+
+/* ── FOOTER ── */
+footer {
+  max-width: 1200px;
+  margin: 0 auto;
+  padding: 20px 24px;
+  border-top: 1px solid #1c2225;
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  color: #404647;
+  display: flex;
+  justify-content: space-between;
+  flex-wrap: wrap;
+  gap: 8px;
+}
+
+.section-divider {
+  border: none;
+  border-top: 1px solid #1c2225;
+  margin: 8px 0 40px;
+}
+.sweep-hint {
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  color: #404647;
+  margin-bottom: 12px;
+}
+
+@media (max-width: 768px) {
+  .chart-grid { grid-template-columns: 1fr !important; }
+  .page-title { font-size: 22px; }
+}
+</style>
+</head>
+<body>
+
+<nav>
+  <a class="nav-brand" href="https://luminal.com">
+    <span class="nav-dot"></span>luminal
+  </a>
+  <span class="nav-sep">/</span>
+  <span class="nav-page">benchmarks</span>
+</nav>
+
+<main>
+
+<header class="page-header">
+  <p class="page-eyebrow">performance · time-series</p>
+  <h1 class="page-title">Benchmark Dashboard</h1>
+  <div class="page-meta">
+    <span>Last updated</span>
+    <span class="meta-sep">·</span>
+    <span class="meta-val">May 01, 2026 · 18:56</span>
+    <span class="meta-sep">·</span>
+    <span class="meta-val">1 run in history</span>
+  </div>
+</header>
+
+<div class="legend-strip">
+  <div class="legend-pill"><span class="legend-swatch" style="background:#5b5f61"></span>HF Baseline</div><div class="legend-pill"><span class="legend-swatch" style="background:#3b82f6"></span>torch.compile</div><div class="legend-pill"><span class="legend-swatch" style="background:#a855f7"></span>luminal backend</div><div class="legend-pill"><span class="legend-swatch" style="background:#e8855a"></span>Rust (luminal)</div>
+</div>
+
+
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">metric</span>
+    <h2 class="section-title">TTFT <span class="unit">over time</span></h2>
+    <span class="section-tag">Time to first token (ms)</span>
+  </div>
+  <div class="chart-grid" style="grid-template-columns: repeat(4, 1fr)">
+<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">llama-8b</span>
+  </div>
+  <div id="c_ttft_ms_llama_8b"></div>
+  <script>
+    Plotly.newPlot("c_ttft_ms_llama_8b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [705.9654394979589], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [307.66548847896047], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [461.48114453535527], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [1026.86], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": 
{"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 48, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-4b</span>
+  </div>
+  <div id="c_ttft_ms_qwen3_4b"></div>
+  <script>
+    Plotly.newPlot("c_ttft_ms_qwen3_4b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [869.2860195587855], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [298.27259748708457], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [485.3892414830625], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [398.58], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": 
{"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma3-4b</span>
+  </div>
+  <div id="c_ttft_ms_gemma3_4b"></div>
+  <script>
+    Plotly.newPlot("c_ttft_ms_gemma3_4b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [951.1196144158021], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [300.9451600664761], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [404.43], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": 
["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma4-moe</span>
+  </div>
+  <div id="c_ttft_ms_gemma4_moe"></div>
+  <script>
+    Plotly.newPlot("c_ttft_ms_gemma4_moe", [{"x": ["2026-05-01T18-56-26-996695"], "y": [837.3980740143452], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [245.510076492792], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": 
"#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-moe</span>
+  </div>
+  <div id="c_ttft_ms_qwen3_moe"></div>
+  <script>
+    Plotly.newPlot("c_ttft_ms_qwen3_moe", [{"x": ["2026-05-01T18-56-26-996695"], "y": [1565.540504961973], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [460.077923577046], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [21002.791983017232], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [662.07], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": 
{"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div>
+  </div>
+</section>
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">metric</span>
+    <h2 class="section-title">TPOT <span class="unit">over time</span></h2>
+    <span class="section-tag">Time per output token (ms)</span>
+  </div>
+  <div class="chart-grid" style="grid-template-columns: repeat(4, 1fr)">
+<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">llama-8b</span>
+  </div>
+  <div id="c_tpot_ms_llama_8b"></div>
+  <script>
+    Plotly.newPlot("c_tpot_ms_llama_8b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [34.15271903970279], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [171.7862353892997], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [23.078908618772402], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [51.64], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": 
{"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 48, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-4b</span>
+  </div>
+  <div id="c_tpot_ms_qwen3_4b"></div>
+  <script>
+    Plotly.newPlot("c_tpot_ms_qwen3_4b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [47.71483448566869], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [468.56868775503244], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [26.90318431414198], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [40.62], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": 
{"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma3-4b</span>
+  </div>
+  <div id="c_tpot_ms_gemma3_4b"></div>
+  <script>
+    Plotly.newPlot("c_tpot_ms_gemma3_4b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [52.498737201676704], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [2197.426627812092], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [38.99], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": 
["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma4-moe</span>
+  </div>
+  <div id="c_tpot_ms_gemma4_moe"></div>
+  <script>
+    Plotly.newPlot("c_tpot_ms_gemma4_moe", [{"x": ["2026-05-01T18-56-26-996695"], "y": [83.64427039632574], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [654.9649795080768], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": 
"#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-moe</span>
+  </div>
+  <div id="c_tpot_ms_qwen3_moe"></div>
+  <script>
+    Plotly.newPlot("c_tpot_ms_qwen3_moe", [{"x": ["2026-05-01T18-56-26-996695"], "y": [84.527321747737], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [753.0061075551203], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [1166.8824461026816], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [60.08], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": 
{"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div>
+  </div>
+</section>
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">metric</span>
+    <h2 class="section-title">Time to Search <span class="unit">over time</span></h2>
+    <span class="section-tag">Search time (sec)</span>
+  </div>
+  <div class="chart-grid" style="grid-template-columns: repeat(4, 1fr)">
+<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">llama-8b</span>
+  </div>
+  <div id="c_compile_ms_llama_8b"></div>
+  <script>
+    Plotly.newPlot("c_compile_ms_llama_8b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [18.760145067994017], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [95.96263545705006], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [84.45343], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 48, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], 
"ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " sec", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-4b</span>
+  </div>
+  <div id="c_compile_ms_qwen3_4b"></div>
+  <script>
+    Plotly.newPlot("c_compile_ms_qwen3_4b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [4.680963660997804], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [45.345814052037895], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [19.92977], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], 
"ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " sec", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma3-4b</span>
+  </div>
+  <div id="c_compile_ms_gemma3_4b"></div>
+  <script>
+    Plotly.newPlot("c_compile_ms_gemma3_4b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [26.649526304972824], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [156.84164], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " sec", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": 
"#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma4-moe</span>
+  </div>
+  <div id="c_compile_ms_gemma4_moe"></div>
+  <script>
+    Plotly.newPlot("c_compile_ms_gemma4_moe", [{"x": ["2026-05-01T18-56-26-996695"], "y": [38.81582092499593], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " sec", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-moe</span>
+  </div>
+  <div id="c_compile_ms_qwen3_moe"></div>
+  <script>
+    Plotly.newPlot("c_compile_ms_qwen3_moe", [{"x": ["2026-05-01T18-56-26-996695"], "y": [8.341281775035895], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [111.70731823903043], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [80.83241000000001], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": 
["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " sec", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div>
+  </div>
+</section>
+<hr class='section-divider'>
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">sweep · 3d</span>
+    <h2 class="section-title">TTFT <span class="unit">vs search budget · over time</span></h2>
+    <span class="section-tag">1 run</span>
+  </div>
+  <p class="sweep-hint">Drag to rotate · scroll to zoom · each curve = one run</p>
+  <div class="chart-grid" style="grid-template-columns: repeat(4, 1fr)">
+<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">llama-8b</span>
+  </div>
+  <div id="sw_ttft_ms_llama_8b"></div>
+  <script>
+    Plotly.newPlot("sw_ttft_ms_llama_8b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [470.7036415056791, 460.72837291285396, 472.43661794345826], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [751.03, 1038.34, 453.16], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", 
"font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-4b</span>
+  </div>
+  <div id="sw_ttft_ms_qwen3_4b"></div>
+  <script>
+    Plotly.newPlot("sw_ttft_ms_qwen3_4b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [465.02652901108377, 465.9317950136028, 495.75577257201076], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [398.44, 390.08, 559.29], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", 
"font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma3-4b</span>
+  </div>
+  <div id="sw_ttft_ms_gemma3_4b"></div>
+  <script>
+    Plotly.newPlot("sw_ttft_ms_gemma3_4b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [388.19, 436.49, 386.13], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-moe</span>
+  </div>
+  <div id="sw_ttft_ms_qwen3_moe"></div>
+  <script>
+    Plotly.newPlot("sw_ttft_ms_qwen3_moe", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [21002.663500519702, 21018.686580006033, 21034.366824431345], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [656.7, 540.37, 542.34], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", 
"font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div>
+  </div>
+</section>
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">sweep · 3d</span>
+    <h2 class="section-title">TPOT <span class="unit">vs search budget · over time</span></h2>
+    <span class="section-tag">1 run</span>
+  </div>
+  <p class="sweep-hint">Drag to rotate · scroll to zoom · each curve = one run</p>
+  <div class="chart-grid" style="grid-template-columns: repeat(4, 1fr)">
+<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">llama-8b</span>
+  </div>
+  <div id="sw_tpot_ms_llama_8b"></div>
+  <script>
+    Plotly.newPlot("sw_tpot_ms_llama_8b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [23.540849717101082, 23.101884137140587, 23.610779400914907], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [38.2, 51.92, 24.09], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", 
"font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-4b</span>
+  </div>
+  <div id="sw_tpot_ms_qwen3_4b"></div>
+  <script>
+    Plotly.newPlot("sw_tpot_ms_qwen3_4b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [25.875402649398893, 25.884080055402592, 27.492373346467502], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [40.64, 39.98, 55.37], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", 
"font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma3-4b</span>
+  </div>
+  <div id="sw_tpot_ms_gemma3_4b"></div>
+  <script>
+    Plotly.newPlot("sw_tpot_ms_gemma3_4b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [37.47, 41.95, 37.25], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-moe</span>
+  </div>
+  <div id="sw_tpot_ms_qwen3_moe"></div>
+  <script>
+    Plotly.newPlot("sw_tpot_ms_qwen3_moe", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [1166.6714247548953, 1167.2746865515364, 1168.7990181031637], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [59.6, 48.79, 48.88], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", 
"font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div>
+  </div>
+</section>
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">sweep · 3d</span>
+    <h2 class="section-title">Time to Search <span class="unit">vs search budget · over time</span></h2>
+    <span class="section-tag">1 run</span>
+  </div>
+  <p class="sweep-hint">Drag to rotate · scroll to zoom · each curve = one run</p>
+  <div class="chart-grid" style="grid-template-columns: repeat(4, 1fr)">
+<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">llama-8b</span>
+  </div>
+  <div id="sw_compile_ms_llama_8b"></div>
+  <script>
+    Plotly.newPlot("sw_compile_ms_llama_8b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [28.428826077957638, 43.57440591201885, 95.52432684396626], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} sec<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [15.14307, 30.12727, 84.87889], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} sec<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": 
{"text": "sec", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " sec", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-4b</span>
+  </div>
+  <div id="sw_compile_ms_qwen3_4b"></div>
+  <script>
+    Plotly.newPlot("sw_compile_ms_qwen3_4b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [37.92102829599753, 54.08867314597592, 118.29659596900456], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} sec<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [12.448030000000001, 27.06796, 81.89342], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} sec<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": 
{"title": {"text": "sec", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " sec", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma3-4b</span>
+  </div>
+  <div id="sw_compile_ms_gemma3_4b"></div>
+  <script>
+    Plotly.newPlot("sw_compile_ms_gemma3_4b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [102.18644, 186.34269, 498.48983000000004], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} sec<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "sec", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " sec", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-moe</span>
+  </div>
+  <div id="sw_compile_ms_qwen3_moe"></div>
+  <script>
+    Plotly.newPlot("sw_compile_ms_qwen3_moe", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [93.47603664599592, 132.266081985028, 298.05094401398674], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} sec<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [25.48138, 47.5342, 134.79345], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} sec<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": 
{"text": "sec", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " sec", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div>
+  </div>
+</section>
+
+</main>
+
+<footer>
+  <span>luminal · benchmark dashboard</span>
+  <span>generated May 01, 2026 · 18:56</span>
+</footer>
+
+</body>
+</html>
--- a/benchmarks/ttft/db.py
+++ b/benchmarks/ttft/db.py
@@ -0,0 +1,242 @@
+"""SQLite persistence for TTFT/TPOT benchmark runs.
+
+Two tables:
+  runs    — one row per orchestrator invocation
+  results — many rows per run, one per (path, config) combination
+
+`results` carries every field that today's BENCH_RESULT JSON record carries.
+Per-iteration sample arrays (`ttft_ms_samples`, `tpot_ms_samples`) are kept as
+JSON TEXT — they're archival, no consumer aggregates over them.
+
+The default DB path is benchmarks/ttft/bench.db (gitignored). Schema is
+created lazily on first connect.
+"""
+
+from __future__ import annotations
+
+import json
+import sqlite3
+from pathlib import Path
+from typing import Any, Iterable
+
+BENCH_DIR = Path(__file__).resolve().parent
+DEFAULT_DB_PATH = BENCH_DIR / "bench.db"
+
+
+# DDL applied by connect() on every open. Every statement is guarded with
+# IF NOT EXISTS, so running it against an already-initialised file leaves
+# existing tables and indexes alone.
+_SCHEMA = """
+CREATE TABLE IF NOT EXISTS runs (
+  run_id        TEXT PRIMARY KEY,
+  timestamp     TEXT NOT NULL,
+  git_commit    TEXT,
+  git_branch    TEXT,
+  gpu_name      TEXT,
+  gpu_driver    TEXT,
+  gpu_vram_mb   INTEGER,
+  cuda_version  TEXT,
+  mode          TEXT NOT NULL  -- 'single' | 'all-configs' | 'search-sweep' | 'ur-test' | 'ur-test-fast'
+);
+
+CREATE TABLE IF NOT EXISTS results (
+  id              INTEGER PRIMARY KEY AUTOINCREMENT,
+  run_id          TEXT NOT NULL REFERENCES runs(run_id) ON DELETE CASCADE,
+  path            TEXT NOT NULL,
+  model           TEXT NOT NULL,
+  model_key       TEXT,
+  config          TEXT NOT NULL,
+  device          TEXT,
+  dtype           TEXT,
+  prompt_tokens   INTEGER,
+  iters           INTEGER,
+  decode_tokens   INTEGER,
+  search_iters    INTEGER,
+  ttft_ms         REAL,
+  ttft_ms_mean    REAL,
+  tpot_ms         REAL,
+  throughput_tps  REAL,
+  compile_ms      REAL,
+  note            TEXT,
+  error           TEXT,
+  ttft_ms_samples TEXT,
+  tpot_ms_samples TEXT,
+  created_at      TEXT NOT NULL DEFAULT (datetime('now'))
+);
+
+CREATE INDEX IF NOT EXISTS idx_results_run    ON results(run_id);
+CREATE INDEX IF NOT EXISTS idx_results_path   ON results(path);
+CREATE INDEX IF NOT EXISTS idx_results_config ON results(config);
+CREATE INDEX IF NOT EXISTS idx_results_modelk ON results(model_key);
+"""
+
+
+# Columns that map 1:1 from a BENCH_RESULT record dict into `results`.
+_SCALAR_RESULT_COLS = (
+    "path", "model", "model_key", "config",
+    "device", "dtype",
+    "prompt_tokens", "iters", "decode_tokens", "search_iters",
+    "ttft_ms", "ttft_ms_mean", "tpot_ms", "throughput_tps", "compile_ms",
+    "note", "error",
+)
+# Per-iteration sample arrays: written as JSON text by insert_result and
+# decoded back into Python values by _row_to_record.
+_SAMPLE_COLS = ("ttft_ms_samples", "tpot_ms_samples")
+# Complete column list used for INSERTs, in the order the values are bound.
+_ALL_RESULT_COLS = ("run_id",) + _SCALAR_RESULT_COLS + _SAMPLE_COLS
+
+
+def connect(path: str | Path = DEFAULT_DB_PATH) -> sqlite3.Connection:
+    """Return a connection to the bench DB at `path`, creating it if needed.
+
+    Missing parent directories are created first. The returned connection
+    yields `sqlite3.Row` rows, has foreign-key enforcement switched on, and
+    has had the schema DDL applied.
+    """
+    db_file = Path(path)
+    db_file.parent.mkdir(parents=True, exist_ok=True)
+
+    connection = sqlite3.connect(db_file)
+    connection.row_factory = sqlite3.Row
+    connection.execute("PRAGMA foreign_keys = ON")
+    connection.executescript(_SCHEMA)
+    return connection
+
+
+def insert_run(
+    conn: sqlite3.Connection,
+    *,
+    run_id: str,
+    timestamp: str,
+    mode: str,
+    git_commit: str | None = None,
+    git_branch: str | None = None,
+    gpu_name: str | None = None,
+    gpu_driver: str | None = None,
+    gpu_vram_mb: int | None = None,
+    cuda_version: str | None = None,
+    if_exists: str = "ignore",
+) -> str:
+    """Write one row into `runs` and return `run_id`.
+
+    `if_exists` picks the conflict policy for a run_id that is already
+    stored: 'ignore' (default) keeps the existing row as is, 'replace'
+    overwrites it. Any other value raises KeyError. No commit is issued
+    here.
+
+    NOTE(review): 'replace' maps to INSERT OR REPLACE. With foreign keys
+    enabled and `results.run_id` declared ON DELETE CASCADE, this presumably
+    also removes the run's existing result rows — confirm that is intended.
+    """
+    conflict_verbs = {"ignore": "INSERT OR IGNORE", "replace": "INSERT OR REPLACE"}
+    verb = conflict_verbs[if_exists]
+    sql = f"""{verb} INTO runs
+            (run_id, timestamp, git_commit, git_branch,
+             gpu_name, gpu_driver, gpu_vram_mb, cuda_version, mode)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"""
+    params = (
+        run_id, timestamp, git_commit, git_branch,
+        gpu_name, gpu_driver, gpu_vram_mb, cuda_version, mode,
+    )
+    conn.execute(sql, params)
+    return run_id
+
+
+def insert_result(conn: sqlite3.Connection, run_id: str, record: dict[str, Any]) -> int:
+    """Store one BENCH_RESULT-shaped `record` under `run_id`.
+
+    Scalar columns are read from `record` by name (missing keys become NULL).
+    Sample arrays are JSON-encoded unless absent/None. Returns the rowid of
+    the inserted row. No commit is issued here.
+    """
+    scalar_values = [record.get(name) for name in _SCALAR_RESULT_COLS]
+
+    sample_values = []
+    for name in _SAMPLE_COLS:
+        samples = record.get(name)
+        sample_values.append(None if samples is None else json.dumps(samples))
+
+    column_list = ", ".join(_ALL_RESULT_COLS)
+    marks = ", ".join("?" for _ in _ALL_RESULT_COLS)
+    cursor = conn.execute(
+        f"INSERT INTO results ({column_list}) VALUES ({marks})",
+        [run_id, *scalar_values, *sample_values],
+    )
+    return cursor.lastrowid
+
+
+def insert_results(conn: sqlite3.Connection, run_id: str, records: Iterable[dict[str, Any]]) -> int:
+    """Insert every record in `records` under `run_id`, one at a time, and
+    return how many were inserted (0 for an empty iterable)."""
+    inserted = 0
+    for inserted, rec in enumerate(records, start=1):
+        insert_result(conn, run_id, rec)
+    return inserted
+
+
+def latest_run_id(conn: sqlite3.Connection) -> str | None:
+    """Return the run_id of the newest run — greatest timestamp, ties broken
+    by greatest run_id — or None when `runs` is empty."""
+    newest = conn.execute(
+        "SELECT run_id FROM runs ORDER BY timestamp DESC, run_id DESC LIMIT 1"
+    ).fetchone()
+    if not newest:
+        return None
+    return newest["run_id"]
+
+
+def load_run(conn: sqlite3.Connection, run_id: str) -> dict[str, Any] | None:
+    """Fetch the `runs` row for `run_id` as a plain dict, or None if no such
+    run is stored."""
+    cursor = conn.execute("SELECT * FROM runs WHERE run_id = ?", (run_id,))
+    found = cursor.fetchone()
+    if not found:
+        return None
+    return dict(found)
+
+
+def load_runs(conn: sqlite3.Connection) -> list[dict[str, Any]]:
+    """Return every `runs` row as a dict, ordered oldest → newest by
+    timestamp (run_id breaks ties)."""
+    cursor = conn.execute("SELECT * FROM runs ORDER BY timestamp ASC, run_id ASC")
+    return list(map(dict, cursor.fetchall()))
+
+
+def _row_to_record(row: sqlite3.Row) -> dict[str, Any]:
+    """Rebuild a BENCH_RESULT-shaped dict from a `results` row.
+
+    NULL columns are omitted rather than emitted as None, so consumers see
+    the same key set they did with JSON. Sample columns are decoded from
+    their JSON text.
+    """
+    record: dict[str, Any] = {
+        name: row[name]
+        for name in _SCALAR_RESULT_COLS
+        if row[name] is not None
+    }
+    for name in _SAMPLE_COLS:
+        encoded = row[name]
+        if encoded is None:
+            continue
+        record[name] = json.loads(encoded)
+    return record
+
+
+def load_results(conn: sqlite3.Connection, run_id: str) -> list[dict[str, Any]]:
+    """Return the result records stored for `run_id`, ordered by ascending
+    row id (i.e. insertion order)."""
+    query = "SELECT * FROM results WHERE run_id = ? ORDER BY id ASC"
+    fetched = conn.execute(query, (run_id,)).fetchall()
+    return list(map(_row_to_record, fetched))
+
+
+def load_history(conn: sqlite3.Connection) -> list[dict[str, Any]]:
+    """Return all runs in the legacy gen_dashboard.load_history() shape.
+
+    The result is [{"meta": {...}, "results": [...], "sweep": [...]}],
+    sorted oldest→newest. A record goes into "sweep" when its config starts
+    with 's=', otherwise into "results". Missing git fields are reported as
+    "?"; GPU/CUDA fields are only present in `meta` when stored non-NULL.
+    """
+    optional_meta_keys = ("gpu_name", "gpu_driver", "gpu_vram_mb", "cuda_version")
+    history = []
+    for run in load_runs(conn):
+        run_id = run["run_id"]
+        meta = {
+            "run_id":      run_id,
+            "timestamp":   run["timestamp"],
+            "git_commit":  run["git_commit"] or "?",
+            "git_branch":  run["git_branch"] or "?",
+        }
+        for key in optional_meta_keys:
+            if run[key] is not None:
+                meta[key] = run[key]
+
+        comparison: list = []
+        sweep: list = []
+        for rec in load_results(conn, run_id):
+            is_sweep = rec.get("config", "").startswith("s=")
+            bucket = sweep if is_sweep else comparison
+            bucket.append(rec)
+        history.append({"meta": meta, "results": comparison, "sweep": sweep})
+    return history
+
+
+# ── self-test ────────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    # In-memory smoke test: round-trip one record.
+    # The connection is opened directly instead of through connect(), so the
+    # row factory and schema are set up by hand here and the foreign_keys
+    # pragma is not enabled for this test.
+    conn = sqlite3.connect(":memory:")
+    conn.row_factory = sqlite3.Row
+    conn.executescript(_SCHEMA)
+    insert_run(conn, run_id="test", timestamp="2026-04-27T00:00:00", mode="single")
+    insert_result(conn, "test", {
+        "path": "rust",
+        "model": "test-model",
+        "config": "default",
+        "ttft_ms": 12.34,
+        "ttft_ms_samples": [12.0, 12.5, 12.3],
+        "search_iters": 500,
+    })
+    # Exactly one result row must come back; the unpacking fails otherwise.
+    [row] = load_results(conn, "test")
+    assert row["path"] == "rust", row
+    assert row["ttft_ms"] == 12.34, row
+    # Sample arrays are stored as JSON text and must decode to the same list.
+    assert row["ttft_ms_samples"] == [12.0, 12.5, 12.3], row
+    assert latest_run_id(conn) == "test"
+    print("db.py smoke test ok")
--- a/benchmarks/ttft/gen_dashboard.py
+++ b/benchmarks/ttft/gen_dashboard.py
@@ -0,0 +1,832 @@
+"""Time-series benchmark dashboard generator.
+
+Reads every run from the SQLite DB (benchmarks/ttft/bench.db) and produces a
+single standalone HTML file with Plotly.js charts styled to match luminal.com.
+
+Layout:
+  TTFT over time            →  one chart per model, lines = execution paths
+  TPOT over time            →  same
+  Time to Search over time  →  same (shown in seconds)
+
+Usage:
+  python3 benchmarks/ttft/gen_dashboard.py [--db PATH] [--out FILE]
+"""
+
+import argparse
+import json
+from datetime import datetime
+from pathlib import Path
+
+import db
+
+BENCH_DIR = Path(__file__).resolve().parent
+
+# Path colours – kept distinct against the dark green Luminal accent
+PATH_COLORS = {
+    "python_baseline":      "#5b5f61",  # muted slate
+    "python_torch_compile": "#3b82f6",  # blue (luminal accent palette)
+    "python_luminal":       "#a855f7",  # purple (luminal accent palette)
+    "rust":                 "#e8855a",  # warm orange – Rust brand feel
+}
+# Human-readable trace names shown in legends and hover text.
+PATH_LABELS = {
+    "python_baseline":      "HF Baseline",
+    "python_torch_compile": "torch.compile",
+    "python_luminal":       "luminal backend",
+    "rust":                 "Rust (luminal)",
+}
+# Order in which paths are visited when traces are built.
+PATH_ORDER = ["python_baseline", "python_torch_compile", "python_luminal", "rust"]
+
+# (key, short label, y-axis label, scale, axis ticksuffix)
+# scale is applied to raw value before plotting (e.g. ms → sec via 0.001).
+METRICS = [
+    ("ttft_ms",    "TTFT",            "Time to first token (ms)",   1.0,   " ms"),
+    ("tpot_ms",    "TPOT",            "Time per output token (ms)", 1.0,   " ms"),
+    ("compile_ms", "Time to Search",  "Search time (sec)",          0.001, " sec"),
+]
+
+
+# ── data loading ─────────────────────────────────────────────────────────────
+
+def load_history(db_path: Path) -> list[dict]:
+    """Return [{"meta", "results", "sweep"}, …] from the bench DB,
+    oldest→newest. Same shape the legacy JSON loader returned.
+
+    Returns an empty list when `db_path` does not exist, so that merely
+    generating a dashboard never creates an empty database file. The
+    connection is closed before returning, including when loading raises.
+    """
+    if not Path(db_path).exists():
+        return []
+    conn = db.connect(db_path)
+    try:
+        # db.load_history returns fully materialised dicts, so nothing in the
+        # result depends on the connection once it has been closed.
+        return db.load_history(conn)
+    finally:
+        conn.close()
+
+
+def build_series(runs: list[dict]) -> tuple[dict, list[str], list[str]]:
+    """Collect the per-run comparison series used by the 2D charts.
+
+    Returns (data, run_ids, run_labels):
+
+    - data[model][path][metric] = [(run_id, value, commit, ts), ...]
+      where `value` is already multiplied by the metric's scale, `run_id` is
+      the categorical x value and `ts` is kept for tooltip formatting.
+    - run_ids: every run that contributed at least one point, sorted by
+      timestamp.
+    - run_labels: parallel to run_ids; "MMM DD · HH:MM", or a trimmed run_id
+      when the timestamp is not ISO-formatted.
+
+    Records with an error, or without a `ttft_ms` value, are ignored.
+
+    The x-axis is categorical (one column per run_id) rather than a date
+    axis: with several runs on one day a date axis stacked them on a single
+    column, whereas categories keep each run visually distinct.
+    """
+    series: dict = {}
+    # run_id -> timestamp, in first-seen order, for runs that produced data.
+    ts_by_run: dict[str, str] = {}
+
+    for run in runs:
+        meta = run["meta"]
+        run_id = meta["run_id"]
+        ts = meta["timestamp"]
+        commit = meta.get("git_commit", "?")
+        contributed = False
+        for rec in run["results"]:
+            if rec.get("error") or rec.get("ttft_ms") is None:
+                continue
+            model = rec.get("config", rec.get("model", "unknown"))
+            path = rec.get("path", "unknown")
+            per_path = series.setdefault(model, {}).setdefault(path, {})
+            for metric, _short, _ylabel, scale, _suffix in METRICS:
+                raw = rec.get(metric)
+                if raw is None:
+                    continue
+                per_path.setdefault(metric, []).append(
+                    (run_id, raw * scale, commit, ts)
+                )
+                contributed = True
+        if contributed:
+            ts_by_run.setdefault(run_id, ts)
+
+    def _tick_label(rid: str) -> str:
+        stamp = ts_by_run.get(rid, rid)
+        try:
+            return datetime.fromisoformat(stamp).strftime("%b %d · %H:%M")
+        except ValueError:
+            return rid[:16].replace("T", " ")
+
+    run_ids = sorted(ts_by_run, key=lambda rid: ts_by_run.get(rid, rid))
+    run_labels = [_tick_label(rid) for rid in run_ids]
+    return series, run_ids, run_labels
+
+
+def build_sweep_series(runs: list[dict]) -> tuple[dict, list[str]]:
+    """Gather sweep records from every run for the 3D charts.
+
+    Returns (data, run_ids):
+      data[model_key][path][metric][run_id] = {
+          "label":  str,              # short date label for Y axis
+          "commit": str,
+          "points": [(iters, value), …]  # sorted by iters, value pre-scaled
+      }
+      run_ids: runs that carry sweep records, in the order encountered.
+
+    The search budget is taken from `search_iters`, falling back to parsing
+    an "s=<int>" config. Records with an error or with no usable budget are
+    dropped.
+    """
+    sweep: dict = {}
+    ordered_runs: list[str] = []
+
+    for run in runs:
+        if not run.get("sweep"):
+            continue
+        meta = run["meta"]
+        run_id = meta["run_id"]
+        commit = meta.get("git_commit", "?")
+        try:
+            label = datetime.fromisoformat(meta["timestamp"]).strftime("%b %d")
+        except ValueError:
+            label = run_id[:10]
+        if run_id not in ordered_runs:
+            ordered_runs.append(run_id)
+
+        for rec in run["sweep"]:
+            if rec.get("error"):
+                continue
+            budget = rec.get("search_iters")
+            if budget is None:
+                # Fall back to the "s=<n>" config naming convention.
+                cfg = rec.get("config", "")
+                if not cfg.startswith("s="):
+                    continue
+                try:
+                    budget = int(cfg[2:])
+                except ValueError:
+                    continue
+            model_key = rec.get("model_key", "unknown")
+            path = rec.get("path", "unknown")
+            for metric, _short, _ylabel, scale, _suffix in METRICS:
+                raw = rec.get(metric)
+                if raw is None:
+                    continue
+                per_metric = (
+                    sweep.setdefault(model_key, {})
+                    .setdefault(path, {})
+                    .setdefault(metric, {})
+                )
+                if run_id not in per_metric:
+                    per_metric[run_id] = {"label": label, "commit": commit, "points": []}
+                per_metric[run_id]["points"].append((budget, raw * scale))
+
+    # Order each run's points by search budget.
+    for per_model in sweep.values():
+        for per_path in per_model.values():
+            for per_metric in per_path.values():
+                for entry in per_metric.values():
+                    entry["points"].sort(key=lambda point: point[0])
+
+    return sweep, ordered_runs
+
+
+# ── chart building ────────────────────────────────────────────────────────────
+
+def _traces_json(path_data: dict, metric: str, show_legend: bool, unit: str = " ms") -> str:
+    """Serialise one 2D scatter trace per execution path for `metric`.
+
+    Paths are visited in PATH_ORDER; a path is skipped when it is missing
+    from `path_data` or has no series for `metric`. Each point is a
+    (run_id, value, commit, ts) tuple: run_id becomes x, value becomes y, and
+    [commit, ts] is attached as customdata for the hover text. `unit` is
+    appended to the hovered value. Returns a JSON array string.
+    """
+    def _build_trace(path: str) -> dict:
+        display_name = PATH_LABELS.get(path, path)
+        rows = path_data[path][metric]
+        hover = (
+            f"<b>{display_name}</b><br>"
+            "%{customdata[1]}<br>"
+            f"%{{y:.1f}}{unit}<br>"
+            "<span style='color:#7e8385'>commit %{customdata[0]}</span>"
+            "<extra></extra>"
+        )
+        return {
+            "x": [row[0] for row in rows],
+            "y": [row[1] for row in rows],
+            "customdata": [[row[2], row[3]] for row in rows],
+            "type": "scatter",
+            "mode": "lines+markers",
+            "name": display_name,
+            "line": {"color": PATH_COLORS.get(path, "#aaa"), "width": 2},
+            "marker": {"size": 7, "symbol": "circle"},
+            "connectgaps": False,
+            "showlegend": show_legend,
+            "hovertemplate": hover,
+        }
+
+    plotted_paths = [
+        path for path in PATH_ORDER
+        if path in path_data and metric in path_data[path]
+    ]
+    return json.dumps([_build_trace(path) for path in plotted_paths])
+
+
+# Base Plotly layout shared by the 2D per-model charts. _chart_card copies it
+# and overrides the x-axis category/tick arrays and the y-axis ticksuffix;
+# for cards without a legend it also hides the legend and shrinks the bottom
+# margin.
+_CHART_LAYOUT = {
+    "plot_bgcolor":  "#0d1416",
+    "paper_bgcolor": "#141b1d",
+    "font":          {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"},
+    "margin":        {"t": 16, "b": 48, "l": 52, "r": 12},
+    "height":        280,
+    "xaxis": {
+        # Categorical: one column per run, evenly spaced. Same-day runs
+        # used to collapse on a date axis; this keeps every run distinct.
+        "type":          "category",
+        "categoryorder": "array",  # categoryarray injected per chart
+        "color":         "#5b5f61",
+        "gridcolor":     "#1c2225",
+        "linecolor":     "#2d3335",
+        "tickfont":      {"size": 11, "family": "Geist Mono, monospace"},
+        "tickangle":     -30,
+        "automargin":    True,
+        "zeroline":      False,
+    },
+    "yaxis": {
+        "rangemode": "tozero",
+        "color":     "#5b5f61",
+        "gridcolor": "#1c2225",
+        "linecolor": "#2d3335",
+        "tickfont":  {"size": 11, "family": "Geist Mono, monospace"},
+        "ticksuffix": " ms",  # default; replaced per chart with the metric's unit
+        "zeroline":  False,
+    },
+    "legend": {
+        "orientation": "h",
+        "y": -0.28,
+        "x": 0,
+        "font": {"size": 11, "color": "#a1a4a5"},
+        "bgcolor": "rgba(0,0,0,0)",
+    },
+    "hoverlabel": {
+        "bgcolor":    "#1c2225",
+        "bordercolor":"#2d3335",
+        "font":       {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"},
+    },
+}
+
+
+def _chart_card(div_id: str, model: str, traces_json: str, show_legend: bool,
+                run_ids: list[str], run_labels: list[str], unit: str = " ms") -> str:
+    """Render the HTML card (header, target div, Plotly script) for one 2D chart.
+
+    The layout starts from _CHART_LAYOUT; the x-axis is pinned to `run_ids`
+    with `run_labels` as tick text, and the y-axis ticksuffix is set to
+    `unit`. When `show_legend` is false the legend is hidden and the bottom
+    margin is reduced to 16. `model` and `div_id` are inserted into the
+    markup as given.
+    """
+    layout = dict(_CHART_LAYOUT)
+    layout["xaxis"] = dict(
+        _CHART_LAYOUT["xaxis"],
+        categoryarray=run_ids,
+        tickvals=run_ids,
+        ticktext=run_labels,
+    )
+    layout["yaxis"] = dict(_CHART_LAYOUT["yaxis"], ticksuffix=unit)
+    if not show_legend:
+        layout["legend"] = dict(_CHART_LAYOUT["legend"], visible=False)
+        layout["margin"] = dict(_CHART_LAYOUT["margin"], b=16)
+    layout_json = json.dumps(layout)
+    return f"""<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">{model}</span>
+  </div>
+  <div id="{div_id}"></div>
+  <script>
+    Plotly.newPlot("{div_id}", {traces_json}, {layout_json},
+      {{responsive: true, displayModeBar: false}});
+  </script>
+</div>"""
+
+
+def _sweep_3d_traces_json(model_data: dict, metric: str, run_ids: list[str], unit: str = " ms") -> str:
+    """One scatter3d trace per (path, run) — same colour per path, stacked by run on Y.
+
+    `model_data` is indexed as model_data[path][metric][run_id]; each entry
+    carries "label", "commit" and "points" ((iters, value) pairs). Runs are
+    visited in `run_ids` order and paths in PATH_ORDER. Only the first trace
+    emitted for a path gets a legend entry; later traces share its
+    legendgroup. `unit` is appended to hovered values. Returns the trace
+    list serialised with json.dumps.
+
+    NOTE(review): the Y coordinate is the run's label, not its run_id, so two
+    runs carrying the same label would land on the same Y position — confirm
+    labels are unique per run.
+    """
+    traces = []
+    # Paths that already contributed a legend entry.
+    path_legend_shown: set[str] = set()
+
+    for run_id in run_ids:
+        for path in PATH_ORDER:
+            run_map = model_data.get(path, {}).get(metric, {})
+            if run_id not in run_map:
+                continue
+            entry = run_map[run_id]
+            pts = entry["points"]
+            label = entry["label"]
+            commit = entry["commit"]
+            color = PATH_COLORS.get(path, "#aaa")
+            show_legend = path not in path_legend_shown
+            path_legend_shown.add(path)
+
+            traces.append({
+                "type": "scatter3d",
+                "mode": "lines+markers",
+                "x": [p[0] for p in pts],   # search iters
+                "y": [label] * len(pts),     # run label (categorical)
+                "z": [p[1] for p in pts],    # value (already scaled by build_sweep_series)
+                "name": PATH_LABELS.get(path, path),
+                "legendgroup": path,
+                "showlegend": show_legend,
+                "line":   {"color": color, "width": 5},
+                "marker": {"color": color, "size": 4},
+                "hovertemplate": (
+                    f"<b>{PATH_LABELS.get(path, path)}</b><br>"
+                    f"s=%{{x}} iters<br>%{{z:.1f}}{unit}<br>"
+                    f"{label} · {commit}"
+                    "<extra></extra>"
+                ),
+            })
+
+    # Cross-run wire lines: for each path, connect same-budget points across
+    # runs. Makes regressions at a fixed search budget visible as a kink in the
+    # wireframe. Dashed + thinner than the per-run curves; legendgroup matches
+    # the path so toggling one toggles both.
+    for path in PATH_ORDER:
+        metric_runs = model_data.get(path, {}).get(metric, {})
+        # A wire needs at least two runs to connect.
+        if len(metric_runs) < 2:
+            continue
+        color = PATH_COLORS.get(path, "#aaa")
+        # by_budget[iters] -> list of (run_label, value) in chronological order
+        by_budget: dict = {}
+        for run_id in run_ids:
+            if run_id not in metric_runs:
+                continue
+            entry = metric_runs[run_id]
+            for iters, val in entry["points"]:
+                by_budget.setdefault(iters, []).append((entry["label"], val))
+        for budget, items in sorted(by_budget.items()):
+            # Budgets measured in a single run have nothing to connect to.
+            if len(items) < 2:
+                continue
+            traces.append({
+                "type": "scatter3d",
+                "mode": "lines",
+                "x": [budget] * len(items),
+                "y": [it[0] for it in items],
+                "z": [it[1] for it in items],
+                "legendgroup": path,
+                "showlegend": False,
+                "line": {"color": color, "width": 2, "dash": "dash"},
+                "hovertemplate": (
+                    f"<b>{PATH_LABELS.get(path, path)} @ s={budget}</b><br>"
+                    f"%{{y}}: %{{z:.1f}}{unit}"
+                    "<extra></extra>"
+                ),
+            })
+    return json.dumps(traces)
+
+
+# Base Plotly layout for the 3D sweep charts: x = search iterations (log
+# scale), y = run, z = metric value. The z-axis title text and ticksuffix
+# below are defaults; _sweep_3d_card replaces both with the metric's unit.
+_SWEEP_3D_LAYOUT = {
+    "paper_bgcolor": "#141b1d",
+    "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11},
+    "height": 420,
+    "margin": {"t": 20, "b": 0, "l": 0, "r": 0},
+    "legend": {
+        "orientation": "h",
+        "y": -0.05,
+        "x": 0,
+        "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"},
+        "bgcolor": "rgba(0,0,0,0)",
+    },
+    "hoverlabel": {
+        "bgcolor": "#1c2225",
+        "bordercolor": "#2d3335",
+        "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"},
+    },
+    "scene": {
+        "bgcolor": "#0d1416",
+        "xaxis": {
+            "title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}},
+            "type": "log",
+            "tickvals": [5, 10, 20, 50, 100, 500],
+            "ticktext": ["5", "10", "20", "50", "100", "500"],
+            "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"},
+            "gridcolor": "#1c2225",
+            "linecolor": "#2d3335",
+            "zerolinecolor": "#2d3335",
+        },
+        "yaxis": {
+            "title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}},
+            "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"},
+            "gridcolor": "#1c2225",
+            "linecolor": "#2d3335",
+        },
+        "zaxis": {
+            "title": {"text": "ms", "font": {"size": 10, "color": "#7e8385"}},
+            "rangemode": "tozero",
+            "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"},
+            "ticksuffix": " ms",
+            "gridcolor": "#1c2225",
+            "linecolor": "#2d3335",
+        },
+        "camera": {
+            "eye": {"x": 1.6, "y": -1.6, "z": 0.9},
+        },
+    },
+}
+
+
+def _sweep_3d_card(div_id: str, model: str, traces_json: str, unit: str = " ms") -> str:
+    """Render the HTML card (header, target div, Plotly script) for one 3D
+    sweep chart.
+
+    The layout is _SWEEP_3D_LAYOUT with the z-axis adapted to `unit`: the
+    axis title becomes the unit with surrounding whitespace stripped, and
+    the tick suffix becomes `unit` as given. `model` and `div_id` are
+    inserted into the markup as given.
+    """
+    base_scene = _SWEEP_3D_LAYOUT["scene"]
+    zaxis = dict(base_scene["zaxis"])
+    zaxis["title"] = dict(base_scene["zaxis"]["title"], text=unit.strip())
+    zaxis["ticksuffix"] = unit
+
+    layout = dict(_SWEEP_3D_LAYOUT)
+    layout["scene"] = dict(base_scene, zaxis=zaxis)
+    layout_json = json.dumps(layout)
+    return f"""<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">{model}</span>
+  </div>
+  <div id="{div_id}"></div>
+  <script>
+    Plotly.newPlot("{div_id}", {traces_json}, {layout_json},
+      {{responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]}});
+  </script>
+</div>"""
+
+
+# ── HTML assembly ─────────────────────────────────────────────────────────────
+
+def build_html(runs: list[dict], data: dict,
+               run_ids: list[str], run_labels: list[str],
+               sweep_data: dict | None = None,
+               sweep_run_ids: list[str] | None = None) -> str:
+    # Preserve insertion order of models as seen across runs
+    models = list(dict.fromkeys(
+        r["config"]
+        for run in runs
+        for r in run["results"]
+        if not r.get("config", "").startswith("s=") and not r.get("error")
+    ))
+
+    last_ts = ""
+    if runs:
+        raw = runs[-1]["meta"]["timestamp"]
+        try:
+            last_ts = datetime.fromisoformat(raw).strftime("%b %d, %Y · %H:%M")
+        except ValueError:
+            last_ts = raw[:16].replace("T", " ")
+
+    n_runs = len(runs)
+
+    sections_html = ""
+    for metric_key, metric_label, ylabel, _scale, unit in METRICS:
+        active_models = [
+            m for m in models
+            if any(metric_key in data.get(m, {}).get(p, {}) for p in PATH_ORDER)
+        ]
+        if not active_models:
+            continue
+
+        cards_html = ""
+        first = True
+        for model in active_models:
+            path_data = data.get(model, {})
+            div_id = f"c_{metric_key}_{model.replace('-','_').replace('.','_')}"
+            traces = _traces_json(path_data, metric_key, show_legend=first, unit=unit)
+            cards_html += _chart_card(div_id, model, traces, show_legend=first,
+                                      run_ids=run_ids, run_labels=run_labels, unit=unit)
+            first = False
+
+        n = len(active_models)
+        # Clamp columns so charts don't get too narrow; wrap at 4
+        cols = min(n, 4)
+        sections_html += f"""
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">metric</span>
+    <h2 class="section-title">{metric_label} <span class="unit">over time</span></h2>
+    <span class="section-tag">{ylabel}</span>
+  </div>
+  <div class="chart-grid" style="grid-template-columns: repeat({cols}, 1fr)">
+{cards_html}
+  </div>
+</section>"""
+
+    # ── sweep sections (3D) ──────────────────────────────────────────────────
+    sweep_sections_html = ""
+    if sweep_data and sweep_run_ids:
+        sweep_models = list(sweep_data.keys())
+        for metric_key, metric_label, ylabel, _scale, unit in METRICS:
+            active = [
+                m for m in sweep_models
+                if any(
+                    run_id in sweep_data[m].get(p, {}).get(metric_key, {})
+                    for p in PATH_ORDER
+                    for run_id in sweep_run_ids
+                )
+            ]
+            if not active:
+                continue
+            cards_html = ""
+            for model in active:
+                div_id = f"sw_{metric_key}_{model.replace('-','_').replace('.','_')}"
+                traces = _sweep_3d_traces_json(sweep_data[model], metric_key, sweep_run_ids, unit=unit)
+                cards_html += _sweep_3d_card(div_id, model, traces, unit=unit)
+            cols = min(len(active), 4)
+            run_count = len(sweep_run_ids)
+            sweep_sections_html += f"""
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">sweep · 3d</span>
+    <h2 class="section-title">{metric_label} <span class="unit">vs search budget · over time</span></h2>
+    <span class="section-tag">{run_count} run{"s" if run_count != 1 else ""}</span>
+  </div>
+  <p class="sweep-hint">Drag to rotate · scroll to zoom · each curve = one run</p>
+  <div class="chart-grid" style="grid-template-columns: repeat({cols}, 1fr)">
+{cards_html}
+  </div>
+</section>"""
+
+    return f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>Luminal · Benchmark Dashboard</title>
+<link rel="preconnect" href="https://fonts.googleapis.com">
+<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+<link href="https://fonts.googleapis.com/css2?family=Geist:wght@300;400;500;600&family=Geist+Mono:wght@300;400;500&display=swap" rel="stylesheet">
+<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
+<style>
+*, *::before, *::after {{ box-sizing: border-box; margin: 0; padding: 0; }}
+html {{ -webkit-font-smoothing: antialiased; scroll-behavior: smooth; }}
+
+body {{
+  font-family: 'Geist', system-ui, sans-serif;
+  background: #030712;
+  color: #d7d8d9;
+  min-height: 100vh;
+  line-height: 1.5;
+}}
+
+/* ── NAV ── */
+nav {{
+  position: sticky;
+  top: 0;
+  z-index: 50;
+  height: 56px;
+  background: rgba(8, 15, 17, 0.92);
+  backdrop-filter: blur(8px);
+  -webkit-backdrop-filter: blur(8px);
+  border-bottom: 1px solid #2d3335;
+  display: flex;
+  align-items: center;
+  padding: 0 24px;
+  gap: 0;
+}}
+.nav-brand {{
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  font-family: 'Geist Mono', monospace;
+  font-size: 14px;
+  font-weight: 500;
+  letter-spacing: 0.05em;
+  color: #2faa6e;
+  text-decoration: none;
+}}
+.nav-dot {{
+  width: 6px;
+  height: 6px;
+  background: #2faa6e;
+  border-radius: 50%;
+  flex-shrink: 0;
+  animation: pulse-glow 2s ease-in-out infinite;
+}}
+.nav-sep {{
+  color: #2d3335;
+  margin: 0 14px;
+  font-size: 18px;
+  font-weight: 300;
+}}
+.nav-page {{
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: #7e8385;
+}}
+
+@keyframes pulse-glow {{
+  0%, 100% {{ opacity: 1; }}
+  50%       {{ opacity: 0.35; }}
+}}
+
+/* ── MAIN ── */
+main {{
+  max-width: 1200px;
+  margin: 0 auto;
+  padding: 40px 24px 80px;
+}}
+
+/* ── PAGE HEADER ── */
+.page-header {{
+  margin-bottom: 40px;
+  padding-bottom: 32px;
+  border-bottom: 1px solid #1c2225;
+}}
+.page-eyebrow {{
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: #2faa6e;
+  margin-bottom: 10px;
+}}
+.page-title {{
+  font-size: 30px;
+  font-weight: 500;
+  letter-spacing: -0.025em;
+  color: #d7d8d9;
+  margin-bottom: 10px;
+}}
+.page-meta {{
+  font-size: 14px;
+  color: #7e8385;
+  display: flex;
+  align-items: center;
+  gap: 0;
+  flex-wrap: wrap;
+}}
+.meta-sep {{
+  font-family: 'Geist Mono', monospace;
+  color: #2d3335;
+  margin: 0 10px;
+}}
+.meta-val {{
+  font-family: 'Geist Mono', monospace;
+  font-size: 13px;
+  color: #5b5f61;
+}}
+
+/* ── LEGEND STRIP ── */
+.legend-strip {{
+  display: flex;
+  flex-wrap: wrap;
+  gap: 6px;
+  margin-bottom: 32px;
+}}
+.legend-pill {{
+  display: flex;
+  align-items: center;
+  gap: 6px;
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  color: #a1a4a5;
+  background: #141b1d;
+  border: 1px solid #2d3335;
+  border-radius: 2px;
+  padding: 4px 10px;
+}}
+.legend-swatch {{
+  width: 8px;
+  height: 8px;
+  border-radius: 50%;
+  flex-shrink: 0;
+}}
+
+/* ── SECTIONS ── */
+section {{ margin-bottom: 48px; }}
+.section-header {{
+  display: flex;
+  align-items: baseline;
+  gap: 10px;
+  margin-bottom: 16px;
+  padding-bottom: 12px;
+  border-bottom: 1px solid #1c2225;
+  flex-wrap: wrap;
+}}
+.section-eyebrow {{
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: #404647;
+}}
+.section-title {{
+  font-size: 18px;
+  font-weight: 500;
+  color: #d7d8d9;
+  letter-spacing: -0.01em;
+}}
+.section-title .unit {{
+  color: #7e8385;
+  font-weight: 400;
+}}
+.section-tag {{
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+  color: #2faa6e;
+  background: #162322;
+  border: 1px solid #1c372e;
+  padding: 2px 8px;
+  border-radius: 2px;
+  margin-left: auto;
+}}
+
+/* ── CHART GRID ── */
+.chart-grid {{
+  display: grid;
+  gap: 10px;
+}}
+.chart-card {{
+  background: #141b1d;
+  border: 1px solid #2d3335;
+  border-radius: 2px;
+  overflow: hidden;
+  transition: border-color 150ms;
+  min-width: 0;
+}}
+.chart-card:hover {{ border-color: #404647; }}
+.chart-card-header {{
+  padding: 10px 14px 0;
+  display: flex;
+  align-items: center;
+}}
+.model-tag {{
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.06em;
+  text-transform: uppercase;
+  color: #7e8385;
+}}
+
+/* ── FOOTER ── */
+footer {{
+  max-width: 1200px;
+  margin: 0 auto;
+  padding: 20px 24px;
+  border-top: 1px solid #1c2225;
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  color: #404647;
+  display: flex;
+  justify-content: space-between;
+  flex-wrap: wrap;
+  gap: 8px;
+}}
+
+.section-divider {{
+  border: none;
+  border-top: 1px solid #1c2225;
+  margin: 8px 0 40px;
+}}
+.sweep-hint {{
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  color: #404647;
+  margin-bottom: 12px;
+}}
+
+@media (max-width: 768px) {{
+  .chart-grid {{ grid-template-columns: 1fr !important; }}
+  .page-title {{ font-size: 22px; }}
+}}
+</style>
+</head>
+<body>
+
+<nav>
+  <a class="nav-brand" href="https://luminal.com">
+    <span class="nav-dot"></span>luminal
+  </a>
+  <span class="nav-sep">/</span>
+  <span class="nav-page">benchmarks</span>
+</nav>
+
+<main>
+
+<header class="page-header">
+  <p class="page-eyebrow">performance · time-series</p>
+  <h1 class="page-title">Benchmark Dashboard</h1>
+  <div class="page-meta">
+    <span>Last updated</span>
+    <span class="meta-sep">·</span>
+    <span class="meta-val">{last_ts}</span>
+    <span class="meta-sep">·</span>
+    <span class="meta-val">{n_runs} run{"s" if n_runs != 1 else ""} in history</span>
+  </div>
+</header>
+
+<div class="legend-strip">
+  {"".join(
+      f'<div class="legend-pill"><span class="legend-swatch" style="background:{PATH_COLORS[p]}"></span>{PATH_LABELS[p]}</div>'
+      for p in PATH_ORDER
+  )}
+</div>
+
+{sections_html}
+{"<hr class='section-divider'>" + sweep_sections_html if sweep_sections_html else ""}
+
+</main>
+
+<footer>
+  <span>luminal · benchmark dashboard</span>
+  <span>generated {last_ts}</span>
+</footer>
+
+</body>
+</html>
+"""
+
+
+# ── entry point ───────────────────────────────────────────────────────────────
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--db", default=str(db.DEFAULT_DB_PATH),
+                    help=f"SQLite bench DB (default: {db.DEFAULT_DB_PATH})")
+    ap.add_argument("--out", default=str(BENCH_DIR / "dashboard.html"),
+                    help="Output HTML file")
+    args = ap.parse_args()
+
+    runs = load_history(Path(args.db))
+    if not runs:
+        print(f"No runs found in {args.db}. Run --ur-test (or backfill) first.")
+        return
+
+    data, run_ids, run_labels = build_series(runs)
+    sweep_data, sweep_run_ids = build_sweep_series(runs)
+    html = build_html(runs, data, run_ids, run_labels, sweep_data, sweep_run_ids)
+    Path(args.out).write_text(html)
+
+    print(f"wrote {args.out}  ({len(runs)} runs, {sum(len(v) for v in data.values())} model×path series)")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/ttft/gen_report.py
+++ b/benchmarks/ttft/gen_report.py
@@ -0,0 +1,349 @@
+#!/usr/bin/env python3
+"""Generate a standalone HTML benchmark report from a single benchmark run.
+
+Usage:
+    python3 gen_report.py [--db PATH] [--run RUN_ID] [--out report.html] [--title "..."]
+
+Sections are split out of a single run automatically:
+  - per-model_key, "comparison" (configs not matching s=N)  →  grouped bar chart
+  - per-model_key, "sweep" (configs matching s=N)           →  line chart (log X)
+For runs without model_key (e.g. single-config runs), one section per detected
+shape is produced instead.
+"""
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+import db
+
+# Execution paths in the order they are iterated when building chart traces
+# and when sorting table rows.
+PATH_ORDER = ["python_baseline", "python_torch_compile", "python_luminal", "rust"]
+# Display name for each path key (used for trace names and the table's Path column).
+PATH_LABELS = {
+    "python_baseline":      "HF Baseline",
+    "python_torch_compile": "torch.compile",
+    "python_luminal":       "luminal backend",
+    "rust":                 "Rust (luminal)",
+}
+# Hex colour used for each path's bars, markers and lines.
+PATH_COLORS = {
+    "python_baseline":      "#888888",
+    "python_torch_compile": "#5ab552",
+    "python_luminal":       "#4c9ed9",
+    "rust":                 "#d97a4c",
+}
+
+
+# ── helpers ──────────────────────────────────────────────────────────────────
+
+def _fmt(v, decimals=1, suffix=""):
+    return f"{v:.{decimals}f}{suffix}" if v is not None else "—"
+
+def _section_title(path: Path) -> str:
+    stem = path.stem.replace("_", " ").replace("-", " ")
+    return stem.title()
+
+def _is_sweep(configs: list[str]) -> bool:
+    return bool(configs) and all(re.fullmatch(r"s=\d+", c) for c in configs)
+
+def _group_by_config(results: list[dict]) -> dict[str, dict[str, dict]]:
+    """Return {config: {path: result_dict}}."""
+    out: dict[str, dict[str, dict]] = {}
+    for r in results:
+        cfg = r.get("config", "default")
+        out.setdefault(cfg, {})[r["path"]] = r
+    return out
+
+
+# ── chart builders (return Plotly figure dicts) ───────────────────────────────
+
+def _bar_figure(by_config: dict, metric: str, title: str,
+                scale: float = 1.0, unit: str = "ms") -> dict:
+    configs = list(by_config.keys())
+    traces = []
+    for path in PATH_ORDER:
+        ys, texts = [], []
+        for cfg in configs:
+            r = by_config[cfg].get(path)
+            raw = r.get(metric) if r and not r.get("error") else None
+            v = raw * scale if raw is not None else None
+            ys.append(v if v is not None else 0)
+            texts.append(f"{v:.1f} {unit}" if v is not None else "n/a")
+        if any(y > 0 for y in ys):
+            traces.append({
+                "type": "bar",
+                "name": PATH_LABELS.get(path, path),
+                "x": configs,
+                "y": ys,
+                "text": texts,
+                "textposition": "outside",
+                "marker": {"color": PATH_COLORS.get(path, "#aaaaaa")},
+                "hovertemplate": "%{x}<br>" + PATH_LABELS.get(path, path)
+                                 + f": %{{y:.1f}} {unit}<extra></extra>",
+            })
+    return {
+        "data": traces,
+        "layout": {
+            "title": title,
+            "yaxis": {"title": unit, "rangemode": "tozero"},
+            "barmode": "group",
+            "legend": {"orientation": "h", "y": -0.2},
+            "margin": {"t": 50, "b": 80},
+            "plot_bgcolor": "#fafafa",
+            "paper_bgcolor": "#ffffff",
+        },
+    }
+
+
+def _line_figure(by_config: dict, metric: str, title: str,
+                 scale: float = 1.0, unit: str = "ms") -> dict:
+    """Line chart for sweep data. Config names are 's=N'; X = N (log scale)."""
+    def _iter(cfg):
+        m = re.fullmatch(r"s=(\d+)", cfg)
+        return int(m.group(1)) if m else 0
+
+    configs_sorted = sorted(by_config.keys(), key=_iter)
+    xs = [_iter(c) for c in configs_sorted]
+
+    paths_present = {p for cfg in by_config.values() for p in cfg}
+    traces = []
+    for path in PATH_ORDER:
+        if path not in paths_present:
+            continue
+        ys = []
+        for cfg in configs_sorted:
+            r = by_config[cfg].get(path)
+            raw = r.get(metric) if r and not r.get("error") else None
+            ys.append(raw * scale if raw is not None else None)
+        if any(y is not None for y in ys):
+            traces.append({
+                "type": "scatter",
+                "mode": "lines+markers",
+                "name": PATH_LABELS.get(path, path),
+                "x": xs,
+                "y": ys,
+                "marker": {"size": 8, "color": PATH_COLORS.get(path, "#aaaaaa")},
+                "line": {"color": PATH_COLORS.get(path, "#aaaaaa"), "width": 2},
+                "hovertemplate": "iters=%{x}<br>" + PATH_LABELS.get(path, path)
+                                 + f": %{{y:.1f}} {unit}<extra></extra>",
+            })
+    return {
+        "data": traces,
+        "layout": {
+            "title": title,
+            "xaxis": {"title": "Search iterations", "type": "log",
+                      "tickvals": xs, "ticktext": [str(x) for x in xs]},
+            "yaxis": {"title": unit, "rangemode": "tozero"},
+            "legend": {"orientation": "h", "y": -0.25},
+            "margin": {"t": 50, "b": 90},
+            "plot_bgcolor": "#fafafa",
+            "paper_bgcolor": "#ffffff",
+        },
+    }
+
+
+# ── table builder ─────────────────────────────────────────────────────────────
+
+def _table_html(results: list[dict]) -> str:
+    rows = []
+    for r in sorted(results, key=lambda r: (r.get("config", ""), PATH_ORDER.index(r["path"]) if r["path"] in PATH_ORDER else 99)):
+        error = r.get("error")
+        style = ' style="background:#fff0f0"' if error else ""
+        path_label = PATH_LABELS.get(r["path"], r["path"])
+        cfg = r.get("config", "—")
+        ttft = _fmt(r.get("ttft_ms"), 1, " ms")
+        tpot = _fmt(r.get("tpot_ms"), 1, " ms")
+        tput = _fmt(r.get("throughput_tps"), 1, " tok/s")
+        comp = _fmt(r.get("compile_ms"), 0, " ms") if r.get("compile_ms") else "—"
+        ptok = str(r.get("prompt_tokens", "—"))
+        note = (r.get("error") or r.get("note") or "")[:90]
+        note_style = ' style="color:#c00"' if error else ' style="color:#777"'
+        rows.append(
+            f'<tr{style}>'
+            f'<td>{path_label}</td><td>{cfg}</td>'
+            f'<td>{ttft}</td><td>{tpot}</td><td>{tput}</td>'
+            f'<td>{comp}</td><td>{ptok}</td>'
+            f'<td{note_style}>{note}</td>'
+            f'</tr>'
+        )
+    return (
+        '<table>'
+        '<thead><tr>'
+        '<th>Path</th><th>Config</th>'
+        '<th>TTFT</th><th>TPOT</th><th>Throughput</th>'
+        '<th>Compile</th><th>Prompt tokens</th><th>Note</th>'
+        '</tr></thead>'
+        '<tbody>' + "\n".join(rows) + '</tbody>'
+        '</table>'
+    )
+
+
+# ── section builder ───────────────────────────────────────────────────────────
+
+def _section_html(sec_id: str, title: str, results: list[dict], fig_counter: list) -> str:
+    by_config = _group_by_config(results)
+    configs = list(by_config.keys())
+    sweep = _is_sweep(configs)
+
+    models = list(dict.fromkeys(r.get("model", "") for r in results if r.get("model")))
+    model_str = ", ".join(models) if models else "—"
+    prompt_tokens = list(dict.fromkeys(r.get("prompt_tokens") for r in results if r.get("prompt_tokens")))
+    tok_str = "/".join(str(t) for t in prompt_tokens) + " prompt tokens" if prompt_tokens else ""
+
+    builder = _line_figure if sweep else _bar_figure
+    ttft_fig = builder(by_config, "ttft_ms", "TTFT")
+    has_tpot = any(r.get("tpot_ms") is not None for r in results if not r.get("error"))
+    tpot_fig = builder(by_config, "tpot_ms", "TPOT") if has_tpot else None
+    has_compile = any(r.get("compile_ms") is not None and r.get("compile_ms") > 0
+                      for r in results if not r.get("error"))
+    compile_fig = (builder(by_config, "compile_ms", "Time to Search",
+                           scale=0.001, unit="sec")
+                   if has_compile else None)
+
+    def chart_div(fig):
+        n = fig_counter[0]
+        fig_counter[0] += 1
+        return (
+            f'<div id="fig{n}" class="chart"></div>'
+            f'<script>Plotly.newPlot("fig{n}", {json.dumps(fig["data"])}, {json.dumps(fig["layout"])}, {{responsive:true}});</script>'
+        )
+
+    charts_html = f'<div class="charts-row">{chart_div(ttft_fig)}'
+    if tpot_fig:
+        charts_html += chart_div(tpot_fig)
+    if compile_fig:
+        charts_html += chart_div(compile_fig)
+    charts_html += '</div>'
+
+    return f"""
+<section id="{sec_id}">
+  <h2>{title}</h2>
+  <p class="meta">{model_str}{" · " + tok_str if tok_str else ""} · {len(results)} results</p>
+  {charts_html}
+  {_table_html(results)}
+</section>
+"""
+
+
+# ── full page ─────────────────────────────────────────────────────────────────
+
+# Stylesheet for the standalone report page. _build_html interpolates it into
+# a <style> element, so braces here are literal CSS braces (no f-string doubling).
+CSS = """
+* { box-sizing: border-box; margin: 0; padding: 0; }
+body { font-family: system-ui, sans-serif; background: #f0f2f5; color: #222; }
+header { background: #1a1a2e; color: #fff; padding: 1rem 2rem;
+         position: sticky; top: 0; z-index: 100; display: flex;
+         align-items: center; gap: 2rem; }
+header h1 { font-size: 1.2rem; white-space: nowrap; }
+nav a { color: #a0c4ff; text-decoration: none; font-size: 0.9rem;
+        padding: 0.3rem 0.7rem; border-radius: 4px; white-space: nowrap; }
+nav a:hover { background: rgba(255,255,255,0.15); }
+main { max-width: 1400px; margin: 0 auto; padding: 2rem; display: flex;
+       flex-direction: column; gap: 2.5rem; }
+section { background: #fff; border-radius: 8px; padding: 1.5rem 2rem;
+          box-shadow: 0 1px 4px rgba(0,0,0,.08); }
+h2 { font-size: 1.3rem; margin-bottom: 0.4rem; }
+.meta { color: #666; font-size: 0.85rem; margin-bottom: 1.2rem; }
+.charts-row { display: flex; gap: 1.5rem; flex-wrap: wrap; margin-bottom: 1.5rem; }
+.chart { flex: 1; min-width: 340px; height: 360px; }
+table { width: 100%; border-collapse: collapse; font-size: 0.82rem; }
+thead tr { background: #f5f5f5; }
+th, td { padding: 0.45rem 0.7rem; text-align: left;
+         border-bottom: 1px solid #e8e8e8; }
+th { font-weight: 600; white-space: nowrap; }
+tr:last-child td { border-bottom: none; }
+tr:hover { background: #fafafa; }
+"""
+
+def _build_html(sections: list[tuple[str, str, list[dict]]], title: str) -> str:
+    nav_links = "".join(f'<a href="#{sid}">{stitle}</a>' for sid, stitle, _ in sections)
+    fig_counter = [0]
+    body = "".join(_section_html(sid, stitle, results, fig_counter)
+                   for sid, stitle, results in sections)
+    return f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>{title}</title>
+<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
+<style>{CSS}</style>
+</head>
+<body>
+<header>
+  <h1>{title}</h1>
+  <nav>{nav_links}</nav>
+</header>
+<main>{body}</main>
+</body>
+</html>"""
+
+
+# ── CLI ───────────────────────────────────────────────────────────────────────
+
+def _sections_for_run(results: list[dict]) -> list[tuple[str, str, list[dict]]]:
+    """Split a single run's results into (sec_id, title, records) sections.
+
+    Splits first by model_key (NULL → 'results'), then within each by
+    sweep-vs-comparison based on config 's=N' shape."""
+    by_key: dict[str | None, list[dict]] = {}
+    for r in results:
+        by_key.setdefault(r.get("model_key"), []).append(r)
+
+    sections: list[tuple[str, str, list[dict]]] = []
+    for key, recs in by_key.items():
+        comp, sweep = [], []
+        for r in recs:
+            (sweep if str(r.get("config", "")).startswith("s=") else comp).append(r)
+        prefix = (key or "results").replace("-", "_").replace(".", "_")
+        title_prefix = key or "Results"
+        if comp:
+            sections.append((f"{prefix}_comparison",
+                             f"{title_prefix} comparison".strip().title(),
+                             comp))
+        if sweep:
+            sections.append((f"{prefix}_sweep",
+                             f"{title_prefix} sweep".strip().title(),
+                             sweep))
+    return sections
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--db", default=str(db.DEFAULT_DB_PATH),
+                    help=f"SQLite bench DB (default: {db.DEFAULT_DB_PATH})")
+    ap.add_argument("--run", default=None,
+                    help="Run ID to render (default: latest run in DB)")
+    ap.add_argument("--out", default=None,
+                    help="Output HTML path (default: report.html in benchmarks/ttft/)")
+    ap.add_argument("--title", default="Luminal TTFT Benchmark Report",
+                    help="Page title and heading")
+    args = ap.parse_args()
+
+    if not Path(args.db).exists():
+        print(f"DB not found: {args.db}", file=sys.stderr)
+        sys.exit(1)
+
+    conn = db.connect(args.db)
+    run_id = args.run or db.latest_run_id(conn)
+    if run_id is None:
+        print(f"No runs in {args.db}", file=sys.stderr)
+        sys.exit(1)
+
+    results = db.load_results(conn, run_id)
+    if not results:
+        print(f"No results for run {run_id}", file=sys.stderr)
+        sys.exit(1)
+
+    sections = _sections_for_run(results)
+    if not sections:
+        print(f"No section data for run {run_id}", file=sys.stderr)
+        sys.exit(1)
+
+    out = Path(args.out) if args.out else Path(__file__).parent / "report.html"
+    html = _build_html(sections, f"{args.title} — {run_id}")
+    out.write_text(html)
+    print(f"wrote {out}  (run {run_id}, {len(sections)} sections, {len(results)} results)")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/ttft/report.html
+++ b/benchmarks/ttft/report.html
@@ -0,0 +1,148 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>Luminal TTFT Benchmark Report — 2026-05-01T18-56-26-996695</title>
+<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
+<style>
+* { box-sizing: border-box; margin: 0; padding: 0; }
+body { font-family: system-ui, sans-serif; background: #f0f2f5; color: #222; }
+header { background: #1a1a2e; color: #fff; padding: 1rem 2rem;
+         position: sticky; top: 0; z-index: 100; display: flex;
+         align-items: center; gap: 2rem; }
+header h1 { font-size: 1.2rem; white-space: nowrap; }
+nav a { color: #a0c4ff; text-decoration: none; font-size: 0.9rem;
+        padding: 0.3rem 0.7rem; border-radius: 4px; white-space: nowrap; }
+nav a:hover { background: rgba(255,255,255,0.15); }
+main { max-width: 1400px; margin: 0 auto; padding: 2rem; display: flex;
+       flex-direction: column; gap: 2.5rem; }
+section { background: #fff; border-radius: 8px; padding: 1.5rem 2rem;
+          box-shadow: 0 1px 4px rgba(0,0,0,.08); }
+h2 { font-size: 1.3rem; margin-bottom: 0.4rem; }
+.meta { color: #666; font-size: 0.85rem; margin-bottom: 1.2rem; }
+.charts-row { display: flex; gap: 1.5rem; flex-wrap: wrap; margin-bottom: 1.5rem; }
+.chart { flex: 1; min-width: 340px; height: 360px; }
+table { width: 100%; border-collapse: collapse; font-size: 0.82rem; }
+thead tr { background: #f5f5f5; }
+th, td { padding: 0.45rem 0.7rem; text-align: left;
+         border-bottom: 1px solid #e8e8e8; }
+th { font-weight: 600; white-space: nowrap; }
+tr:last-child td { border-bottom: none; }
+tr:hover { background: #fafafa; }
+</style>
+</head>
+<body>
+<header>
+  <h1>Luminal TTFT Benchmark Report — 2026-05-01T18-56-26-996695</h1>
+  <nav><a href="#llama_8b_comparison">Llama-8B Comparison</a><a href="#llama_8b_sweep">Llama-8B Sweep</a><a href="#qwen3_4b_comparison">Qwen3-4B Comparison</a><a href="#qwen3_4b_sweep">Qwen3-4B Sweep</a><a href="#gemma3_4b_comparison">Gemma3-4B Comparison</a><a href="#gemma3_4b_sweep">Gemma3-4B Sweep</a><a href="#gemma4_moe_comparison">Gemma4-Moe Comparison</a><a href="#gemma4_moe_sweep">Gemma4-Moe Sweep</a><a href="#qwen3_moe_comparison">Qwen3-Moe Comparison</a><a href="#qwen3_moe_sweep">Qwen3-Moe Sweep</a></nav>
+</header>
+<main>
+<section id="llama_8b_comparison">
+  <h2>Llama-8B Comparison</h2>
+  <p class="meta">NousResearch/Meta-Llama-3-8B-Instruct · 21 prompt tokens · 4 results</p>
+  <div class="charts-row"><div id="fig0" class="chart"></div><script>Plotly.newPlot("fig0", [{"type": "bar", "name": "HF Baseline", "x": ["llama-8b"], "y": [705.9654394979589], "text": ["706.0 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["llama-8b"], "y": [307.66548847896047], "text": ["307.7 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["llama-8b"], "y": [461.48114453535527], "text": ["461.5 ms"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["llama-8b"], "y": [1026.86], "text": ["1026.9 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig1" class="chart"></div><script>Plotly.newPlot("fig1", [{"type": "bar", "name": "HF Baseline", "x": ["llama-8b"], "y": [34.15271903970279], "text": ["34.2 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["llama-8b"], "y": [171.7862353892997], "text": ["171.8 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["llama-8b"], "y": [23.078908618772402], "text": ["23.1 ms"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, 
"hovertemplate": "%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["llama-8b"], "y": [51.64], "text": ["51.6 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig2" class="chart"></div><script>Plotly.newPlot("fig2", [{"type": "bar", "name": "torch.compile", "x": ["llama-8b"], "y": [18.760145067994017], "text": ["18.8 sec"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} sec<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["llama-8b"], "y": [95.96263545705006], "text": ["96.0 sec"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} sec<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["llama-8b"], "y": [84.45343], "text": ["84.5 sec"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "yaxis": {"title": "sec", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>HF Baseline</td><td>llama-8b</td><td>706.0 ms</td><td>34.2 ms</td><td>29.3 tok/s</td><td>—</td><td>21</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>torch.compile</td><td>llama-8b</td><td>307.7 ms</td><td>171.8 ms</td><td>5.8 tok/s</td><td>18760 ms</td><td>21</td><td style="color:#777">sequential per-token, StaticCache KV cache (torch.compile inductor)</td></tr>
+<tr><td>luminal backend</td><td>llama-8b</td><td>461.5 ms</td><td>23.1 ms</td><td>43.3 tok/s</td><td>95963 ms</td><td>21</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>llama-8b</td><td>1026.9 ms</td><td>51.6 ms</td><td>19.4 tok/s</td><td>84453 ms</td><td>21</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+
+<section id="llama_8b_sweep">
+  <h2>Llama-8B Sweep</h2>
+  <p class="meta">NousResearch/Meta-Llama-3-8B-Instruct · 21 prompt tokens · 6 results</p>
+  <div class="charts-row"><div id="fig3" class="chart"></div><script>Plotly.newPlot("fig3", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [470.7036415056791, 460.72837291285396, 472.43661794345826], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [751.03, 1038.34, 453.16], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig4" class="chart"></div><script>Plotly.newPlot("fig4", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [23.540849717101082, 23.101884137140587, 23.610779400914907], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [38.2, 51.92, 24.09], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": 
"#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig5" class="chart"></div><script>Plotly.newPlot("fig5", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [28.428826077957638, 43.57440591201885, 95.52432684396626], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} sec<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [15.14307, 30.12727, 84.87889], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "sec", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>luminal backend</td><td>s=10</td><td>470.7 ms</td><td>23.5 ms</td><td>42.5 tok/s</td><td>28429 ms</td><td>21</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=10</td><td>751.0 ms</td><td>38.2 ms</td><td>26.2 tok/s</td><td>15143 ms</td><td>21</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr><td>luminal backend</td><td>s=100</td><td>460.7 ms</td><td>23.1 ms</td><td>43.3 tok/s</td><td>43574 ms</td><td>21</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=100</td><td>1038.3 ms</td><td>51.9 ms</td><td>19.3 tok/s</td><td>30127 ms</td><td>21</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr><td>luminal backend</td><td>s=500</td><td>472.4 ms</td><td>23.6 ms</td><td>42.4 tok/s</td><td>95524 ms</td><td>21</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=500</td><td>453.2 ms</td><td>24.1 ms</td><td>41.5 tok/s</td><td>84879 ms</td><td>21</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+
+<section id="qwen3_4b_comparison">
+  <h2>Qwen3-4B Comparison</h2>
+  <p class="meta">Qwen/Qwen3-4B · 19/11 prompt tokens · 4 results</p>
+  <div class="charts-row"><div id="fig6" class="chart"></div><script>Plotly.newPlot("fig6", [{"type": "bar", "name": "HF Baseline", "x": ["qwen3-4b"], "y": [869.2860195587855], "text": ["869.3 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["qwen3-4b"], "y": [298.27259748708457], "text": ["298.3 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["qwen3-4b"], "y": [485.3892414830625], "text": ["485.4 ms"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["qwen3-4b"], "y": [398.58], "text": ["398.6 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig7" class="chart"></div><script>Plotly.newPlot("fig7", [{"type": "bar", "name": "HF Baseline", "x": ["qwen3-4b"], "y": [47.71483448566869], "text": ["47.7 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["qwen3-4b"], "y": [468.56868775503244], "text": ["468.6 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["qwen3-4b"], "y": [26.90318431414198], "text": ["26.9 ms"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, 
"hovertemplate": "%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["qwen3-4b"], "y": [40.62], "text": ["40.6 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig8" class="chart"></div><script>Plotly.newPlot("fig8", [{"type": "bar", "name": "torch.compile", "x": ["qwen3-4b"], "y": [4.680963660997804], "text": ["4.7 sec"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} sec<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["qwen3-4b"], "y": [45.345814052037895], "text": ["45.3 sec"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} sec<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["qwen3-4b"], "y": [19.92977], "text": ["19.9 sec"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "yaxis": {"title": "sec", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>HF Baseline</td><td>qwen3-4b</td><td>869.3 ms</td><td>47.7 ms</td><td>21.0 tok/s</td><td>—</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>torch.compile</td><td>qwen3-4b</td><td>298.3 ms</td><td>468.6 ms</td><td>2.1 tok/s</td><td>4681 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache (torch.compile inductor)</td></tr>
+<tr><td>luminal backend</td><td>qwen3-4b</td><td>485.4 ms</td><td>26.9 ms</td><td>37.2 tok/s</td><td>45346 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>qwen3-4b</td><td>398.6 ms</td><td>40.6 ms</td><td>24.6 tok/s</td><td>19930 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+
+<section id="qwen3_4b_sweep">
+  <h2>Qwen3-4B Sweep</h2>
+  <p class="meta">Qwen/Qwen3-4B · 19/11 prompt tokens · 6 results</p>
+  <div class="charts-row"><div id="fig9" class="chart"></div><script>Plotly.newPlot("fig9", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [465.02652901108377, 465.9317950136028, 495.75577257201076], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [398.44, 390.08, 559.29], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig10" class="chart"></div><script>Plotly.newPlot("fig10", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [25.875402649398893, 25.884080055402592, 27.492373346467502], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [40.64, 39.98, 55.37], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": 
"#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig11" class="chart"></div><script>Plotly.newPlot("fig11", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [37.92102829599753, 54.08867314597592, 118.29659596900456], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} sec<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [12.448030000000001, 27.06796, 81.89342], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "sec", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>luminal backend</td><td>s=10</td><td>465.0 ms</td><td>25.9 ms</td><td>38.6 tok/s</td><td>37921 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=10</td><td>398.4 ms</td><td>40.6 ms</td><td>24.6 tok/s</td><td>12448 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr><td>luminal backend</td><td>s=100</td><td>465.9 ms</td><td>25.9 ms</td><td>38.6 tok/s</td><td>54089 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=100</td><td>390.1 ms</td><td>40.0 ms</td><td>25.0 tok/s</td><td>27068 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr><td>luminal backend</td><td>s=500</td><td>495.8 ms</td><td>27.5 ms</td><td>36.4 tok/s</td><td>118297 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=500</td><td>559.3 ms</td><td>55.4 ms</td><td>18.1 tok/s</td><td>81893 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+
+<section id="gemma3_4b_comparison">
+  <h2>Gemma3-4B Comparison</h2>
+  <p class="meta">unsloth/gemma-3-4b-it · 19/11 prompt tokens · 4 results</p>
+  <div class="charts-row"><div id="fig12" class="chart"></div><script>Plotly.newPlot("fig12", [{"type": "bar", "name": "HF Baseline", "x": ["gemma3-4b"], "y": [951.1196144158021], "text": ["951.1 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["gemma3-4b"], "y": [300.9451600664761], "text": ["300.9 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["gemma3-4b"], "y": [404.43], "text": ["404.4 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig13" class="chart"></div><script>Plotly.newPlot("fig13", [{"type": "bar", "name": "HF Baseline", "x": ["gemma3-4b"], "y": [52.498737201676704], "text": ["52.5 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["gemma3-4b"], "y": [2197.426627812092], "text": ["2197.4 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["gemma3-4b"], "y": [38.99], "text": ["39.0 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": 
"#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig14" class="chart"></div><script>Plotly.newPlot("fig14", [{"type": "bar", "name": "torch.compile", "x": ["gemma3-4b"], "y": [26.649526304972824], "text": ["26.6 sec"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} sec<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["gemma3-4b"], "y": [156.84164], "text": ["156.8 sec"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "yaxis": {"title": "sec", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>HF Baseline</td><td>gemma3-4b</td><td>951.1 ms</td><td>52.5 ms</td><td>19.0 tok/s</td><td>—</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>torch.compile</td><td>gemma3-4b</td><td>300.9 ms</td><td>2197.4 ms</td><td>0.5 tok/s</td><td>26650 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache (torch.compile inductor)</td></tr>
+<tr style="background:#fff0f0"><td>luminal backend</td><td>gemma3-4b</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">bench_python_luminal.py failed with code 1</td></tr>
+<tr><td>Rust (luminal)</td><td>gemma3-4b</td><td>404.4 ms</td><td>39.0 ms</td><td>25.6 tok/s</td><td>156842 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+
+<section id="gemma3_4b_sweep">
+  <h2>Gemma3-4B Sweep</h2>
+  <p class="meta">unsloth/gemma-3-4b-it · 11 prompt tokens · 6 results</p>
+  <div class="charts-row"><div id="fig15" class="chart"></div><script>Plotly.newPlot("fig15", [{"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [388.19, 436.49, 386.13], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig16" class="chart"></div><script>Plotly.newPlot("fig16", [{"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [37.47, 41.95, 37.25], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig17" class="chart"></div><script>Plotly.newPlot("fig17", [{"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [102.18644, 186.34269, 498.48983000000004], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "sec", "rangemode": "tozero"}, 
"legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr style="background:#fff0f0"><td>luminal backend</td><td>s=10</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">bench_python_luminal.py failed with code 1</td></tr>
+<tr><td>Rust (luminal)</td><td>s=10</td><td>388.2 ms</td><td>37.5 ms</td><td>26.7 tok/s</td><td>102186 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr style="background:#fff0f0"><td>luminal backend</td><td>s=100</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">bench_python_luminal.py failed with code 1</td></tr>
+<tr><td>Rust (luminal)</td><td>s=100</td><td>436.5 ms</td><td>42.0 ms</td><td>23.8 tok/s</td><td>186343 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr style="background:#fff0f0"><td>luminal backend</td><td>s=500</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">bench_python_luminal.py failed with code 1</td></tr>
+<tr><td>Rust (luminal)</td><td>s=500</td><td>386.1 ms</td><td>37.2 ms</td><td>26.8 tok/s</td><td>498490 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+
+<section id="gemma4_moe_comparison">
+  <h2>Gemma4-Moe Comparison</h2>
+  <p class="meta">google/gemma-4-26B-A4B · 11 prompt tokens · 4 results</p>
+  <div class="charts-row"><div id="fig18" class="chart"></div><script>Plotly.newPlot("fig18", [{"type": "bar", "name": "HF Baseline", "x": ["gemma4-moe"], "y": [837.3980740143452], "text": ["837.4 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["gemma4-moe"], "y": [245.510076492792], "text": ["245.5 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig19" class="chart"></div><script>Plotly.newPlot("fig19", [{"type": "bar", "name": "HF Baseline", "x": ["gemma4-moe"], "y": [83.64427039632574], "text": ["83.6 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["gemma4-moe"], "y": [654.9649795080768], "text": ["655.0 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig20" class="chart"></div><script>Plotly.newPlot("fig20", [{"type": "bar", "name": "torch.compile", "x": ["gemma4-moe"], "y": [38.81582092499593], "text": ["38.8 sec"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "yaxis": {"title": "sec", "rangemode": "tozero"}, 
"barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>HF Baseline</td><td>gemma4-moe</td><td>837.4 ms</td><td>83.6 ms</td><td>12.0 tok/s</td><td>—</td><td>11</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>torch.compile</td><td>gemma4-moe</td><td>245.5 ms</td><td>655.0 ms</td><td>1.5 tok/s</td><td>38816 ms</td><td>11</td><td style="color:#777">sequential per-token, StaticCache KV cache (torch.compile inductor)</td></tr>
+<tr style="background:#fff0f0"><td>luminal backend</td><td>gemma4-moe</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">bench_python_luminal.py failed with code -9</td></tr>
+<tr style="background:#fff0f0"><td>Rust (luminal)</td><td>gemma4-moe</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">rust bench failed with code -9</td></tr></tbody></table>
+</section>
+
+<section id="gemma4_moe_sweep">
+  <h2>Gemma4-Moe Sweep</h2>
+  <p class="meta">google/gemma-4-26B-A4B · 2 results</p>
+  <div class="charts-row"><div id="fig21" class="chart"></div><script>Plotly.newPlot("fig21", [], {"title": "TTFT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10], "ticktext": ["10"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr style="background:#fff0f0"><td>luminal backend</td><td>s=10</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">bench_python_luminal.py failed with code -9</td></tr>
+<tr style="background:#fff0f0"><td>Rust (luminal)</td><td>s=10</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">rust bench failed with code -9</td></tr></tbody></table>
+</section>
+
+<section id="qwen3_moe_comparison">
+  <h2>Qwen3-Moe Comparison</h2>
+  <p class="meta">Qwen/Qwen3-30B-A3B · 19 prompt tokens · 4 results</p>
+  <div class="charts-row"><div id="fig22" class="chart"></div><script>Plotly.newPlot("fig22", [{"type": "bar", "name": "HF Baseline", "x": ["qwen3-moe"], "y": [1565.540504961973], "text": ["1565.5 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["qwen3-moe"], "y": [460.077923577046], "text": ["460.1 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["qwen3-moe"], "y": [21002.791983017232], "text": ["21002.8 ms"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["qwen3-moe"], "y": [662.07], "text": ["662.1 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig23" class="chart"></div><script>Plotly.newPlot("fig23", [{"type": "bar", "name": "HF Baseline", "x": ["qwen3-moe"], "y": [84.527321747737], "text": ["84.5 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["qwen3-moe"], "y": [753.0061075551203], "text": ["753.0 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["qwen3-moe"], "y": [1166.8824461026816], "text": ["1166.9 ms"], "textposition": "outside", "marker": {"color": 
"#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["qwen3-moe"], "y": [60.08], "text": ["60.1 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig24" class="chart"></div><script>Plotly.newPlot("fig24", [{"type": "bar", "name": "torch.compile", "x": ["qwen3-moe"], "y": [8.341281775035895], "text": ["8.3 sec"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} sec<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["qwen3-moe"], "y": [111.70731823903043], "text": ["111.7 sec"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} sec<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["qwen3-moe"], "y": [80.83241000000001], "text": ["80.8 sec"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "yaxis": {"title": "sec", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>HF Baseline</td><td>qwen3-moe</td><td>1565.5 ms</td><td>84.5 ms</td><td>11.8 tok/s</td><td>—</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>torch.compile</td><td>qwen3-moe</td><td>460.1 ms</td><td>753.0 ms</td><td>1.3 tok/s</td><td>8341 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache (torch.compile inductor)</td></tr>
+<tr><td>luminal backend</td><td>qwen3-moe</td><td>21002.8 ms</td><td>1166.9 ms</td><td>0.9 tok/s</td><td>111707 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>qwen3-moe</td><td>662.1 ms</td><td>60.1 ms</td><td>16.6 tok/s</td><td>80832 ms</td><td>—</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+
+<section id="qwen3_moe_sweep">
+  <h2>Qwen3-Moe Sweep</h2>
+  <p class="meta">Qwen/Qwen3-30B-A3B · 19 prompt tokens · 6 results</p>
+  <div class="charts-row"><div id="fig25" class="chart"></div><script>Plotly.newPlot("fig25", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [21002.663500519702, 21018.686580006033, 21034.366824431345], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [656.7, 540.37, 542.34], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig26" class="chart"></div><script>Plotly.newPlot("fig26", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [1166.6714247548953, 1167.2746865515364, 1168.7990181031637], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [59.6, 48.79, 48.88], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": 
"#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig27" class="chart"></div><script>Plotly.newPlot("fig27", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [93.47603664599592, 132.266081985028, 298.05094401398674], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} sec<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [25.48138, 47.5342, 134.79345], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "sec", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>luminal backend</td><td>s=10</td><td>21002.7 ms</td><td>1166.7 ms</td><td>0.9 tok/s</td><td>93476 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=10</td><td>656.7 ms</td><td>59.6 ms</td><td>16.8 tok/s</td><td>25481 ms</td><td>—</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr><td>luminal backend</td><td>s=100</td><td>21018.7 ms</td><td>1167.3 ms</td><td>0.9 tok/s</td><td>132266 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=100</td><td>540.4 ms</td><td>48.8 ms</td><td>20.5 tok/s</td><td>47534 ms</td><td>—</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr><td>luminal backend</td><td>s=500</td><td>21034.4 ms</td><td>1168.8 ms</td><td>0.9 tok/s</td><td>298051 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=500</td><td>542.3 ms</td><td>48.9 ms</td><td>20.5 tok/s</td><td>134793 ms</td><td>—</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+</main>
+</body>
+</html>
--- a/benchmarks/ttft/run.py
+++ b/benchmarks/ttft/run.py
@@ -0,0 +1,683 @@
+"""TTFT + TPOT benchmark orchestrator.
+
+Runs four paths in isolated subprocesses:
+  1. python_baseline       — HuggingFace / PyTorch eager on CUDA
+  2. python_torch_compile  — torch.compile(model) inductor backend
+  3. python_luminal        — torch.compile(model, backend=luminal_backend)
+  4. rust                  — examples/<package> binary (luminal_cuda_lite)
+
+Use --config to select a named configuration, or --all-configs to run every
+entry in CONFIGS. All output is written to the SQLite bench DB
+(benchmarks/ttft/bench.db); the TUI / dashboard / report read from there.
+
+Notes on comparability:
+  - python_baseline: single chunked forward for TTFT; KV-cache decode for TPOT.
+  - python_torch_compile: inductor, same chunked prefill as baseline; first
+    call triggers JIT compilation (recorded separately as compile_ms).
+  - python_luminal: sequential per-token prefill with StaticCache; TPOT via
+    autoregressive decode steps.
+  - rust: sequential per-token prefill; TTFT = sum of prefill step durations.
+Steady-state execution only — compile / egraph-search time is excluded from TTFT
+but recorded separately as compile_ms for all paths that support it.
+"""
+
+import argparse
+import datetime
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+try:
+    import tomllib
+except ImportError:
+    try:
+        import tomli as tomllib  # type: ignore[no-redef]
+    except ImportError:
+        raise ImportError("Python 3.11+ or 'pip install tomli' required to load benchmarks.toml")
+
+import db
+
+# Directory holding this script, and the repository root two levels above it.
+BENCH_DIR = Path(__file__).resolve().parent
+REPO_ROOT = BENCH_DIR.parent.parent
+
+# Defaults for the --prompt and --model CLI flags.
+DEFAULT_PROMPT = "Explain what a neural network is in a paragraph."
+DEFAULT_MODEL = "NousResearch/Meta-Llama-3-8B-Instruct"
+
+# benchmarks.toml lives next to this script and is parsed once, at import time.
+_CONFIG_PATH = BENCH_DIR / "benchmarks.toml"
+with open(_CONFIG_PATH, "rb") as _f:
+    _BENCH_CONFIG = tomllib.load(_f)
+
+# Named benchmark configurations.  Each entry overrides any subset of the
+# CLI defaults; explicit CLI flags always take precedence over the config.
+CONFIGS: dict = _BENCH_CONFIG["configs"]
+# Model keys and search budgets for --ur-test, read from the [ur_test] table.
+UR_TEST_MODELS: list = _BENCH_CONFIG["ur_test"]["models"]
+SEARCH_SWEEP_ITERS: list = _BENCH_CONFIG["ur_test"]["search_sweep_iters"]
+
+# Prefix of the config names used for sweep runs ("s=<iters>").
+# NOTE(review): not referenced in this part of the file — presumably consumed
+# by the report / dashboard code; confirm before renaming.
+SWEEP_CONFIG_PREFIX = "s="
+
+# Patterns used to scrape results out of captured child-process output.
+# BENCH_LINE captures everything following "BENCH_RESULT " on a line.
+BENCH_LINE = re.compile(r"^BENCH_RESULT (.*)$", re.MULTILINE)
+# RUST_* patterns capture the numeric fields printed by the rust binary.
+RUST_TTFT_LINE = re.compile(r"TTFT:\s*([0-9]+\.?[0-9]*)\s*ms")
+RUST_TPOT_LINE = re.compile(r"TPOT:\s*([0-9]+\.?[0-9]*)\s*ms")
+RUST_COMPILE_LINE = re.compile(r"COMPILE:\s*([0-9]+\.?[0-9]*)\s*ms")
+RUST_PROMPT_LINE = re.compile(r"Prompt:\s*(\d+)\s*tokens")
+
+
+def _stream(proc, tee_prefix):
+    """Drain subprocess stdout, tee-ing to our stdout line-by-line. Returns full stdout."""
+    buf = []
+    assert proc.stdout is not None
+    for line in proc.stdout:
+        buf.append(line)
+        sys.stdout.write(f"[{tee_prefix}] {line}")
+        sys.stdout.flush()
+    proc.wait()
+    return "".join(buf)
+
+
+# File that _snapshot_memory appends to; override with the BENCH_MEM_LOG env var.
+_MEM_LOG_PATH = os.environ.get("BENCH_MEM_LOG", "/tmp/bench_mem_snapshots.log")
+
+
+def _snapshot_memory(label: str) -> None:
+    """Append a host+GPU memory snapshot to BENCH_MEM_LOG. Cheap, never raises."""
+    try:
+        ts = datetime.datetime.now().isoformat(timespec="seconds")
+        meminfo_keys = ("MemTotal", "MemFree", "MemAvailable", "Cached", "Slab", "SReclaimable")
+        meminfo = {}
+        with open("/proc/meminfo") as f:
+            for line in f:
+                k, _, rest = line.partition(":")
+                if k in meminfo_keys:
+                    meminfo[k] = rest.strip().split()[0]  # kB
+        try:
+            gpu = subprocess.check_output(
+                ["nvidia-smi", "--query-gpu=memory.used,memory.free,memory.total",
+                 "--format=csv,noheader,nounits"],
+                stderr=subprocess.DEVNULL, text=True, timeout=5,
+            ).strip().splitlines()[0]
+        except Exception:
+            gpu = "n/a"
+        parent_rss = "?"
+        try:
+            with open(f"/proc/{os.getpid()}/status") as f:
+                for line in f:
+                    if line.startswith("VmRSS:"):
+                        parent_rss = line.split()[1]
+                        break
+        except Exception:
+            pass
+        host_str = " ".join(f"{k}={meminfo.get(k, '?')}kB" for k in meminfo_keys)
+        with open(_MEM_LOG_PATH, "a") as f:
+            f.write(f"{ts}  [{label}]  parent_rss={parent_rss}kB  {host_str}  gpu(used,free,total MiB)={gpu}\n")
+    except Exception as e:
+        sys.stderr.write(f"[mem-snapshot warn] {e}\n")
+
+
+def _cargo_env():
+    """Return env dict with ~/.cargo/bin prepended to PATH."""
+    cargo_bin = str(Path.home() / ".cargo" / "bin")
+    path = os.environ.get("PATH", "")
+    if cargo_bin not in path:
+        path = f"{cargo_bin}:{path}"
+    return {**os.environ, "PATH": path}
+
+
+def run_rust(_prompt, package="llama", env_vars=None):
+    print(f"\n=== Running: rust (examples/{package}) ===", flush=True)
+    cmd = ["cargo", "run", "--release", "-p", package]
+    env = _cargo_env()
+    if env_vars:
+        env.update(env_vars)
+    proc = subprocess.Popen(
+        cmd,
+        cwd=REPO_ROOT,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        env=env,
+    )
+    output = _stream(proc, "rust")
+    if proc.returncode != 0:
+        raise RuntimeError(f"rust bench failed with code {proc.returncode}")
+    m = RUST_TTFT_LINE.search(output)
+    if not m:
+        raise RuntimeError("could not find 'TTFT: X ms' in rust stdout")
+    ttft_ms = float(m.group(1))
+    result = {
+        "path": "rust",
+        "model": DEFAULT_MODEL,
+        "ttft_ms": ttft_ms,
+        "note": "sum of per-token prefill durations",
+    }
+    m_compile = RUST_COMPILE_LINE.search(output)
+    if m_compile:
+        result["compile_ms"] = float(m_compile.group(1))
+    m_tpot = RUST_TPOT_LINE.search(output)
+    if m_tpot:
+        tpot_ms = float(m_tpot.group(1))
+        result["tpot_ms"] = tpot_ms
+        result["throughput_tps"] = 1000.0 / tpot_ms
+    m_prompt = RUST_PROMPT_LINE.search(output)
+    if m_prompt:
+        result["prompt_tokens"] = int(m_prompt.group(1))
+    return result
+
+
+def run_python_script(name, extra_args):
+    script = BENCH_DIR / name
+    print(f"\n=== Running: {script.name} ===", flush=True)
+    cmd = [sys.executable, str(script), *extra_args]
+    proc = subprocess.Popen(
+        cmd,
+        cwd=REPO_ROOT,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        env={**os.environ},
+    )
+    output = _stream(proc, script.stem)
+    if proc.returncode != 0:
+        raise RuntimeError(f"{script.name} failed with code {proc.returncode}")
+    m = BENCH_LINE.search(output)
+    if not m:
+        raise RuntimeError(f"no BENCH_RESULT line in {script.name} output")
+    return json.loads(m.group(1))
+
+
+# Canonical left-to-right ordering of the four benchmark paths in plots.
+PATH_ORDER = ["python_baseline", "python_torch_compile", "python_luminal", "rust"]
+# Two-line x-axis labels used by plot().
+# NOTE(review): the rust label hard-codes "examples/llama" even when a
+# different --rust-package is benchmarked.
+PATH_LABELS = {
+    "python_baseline": "Python\n(HF baseline)",
+    "python_torch_compile": "Python\n(torch.compile)",
+    "python_luminal": "Python → Rust\n(luminal_backend)",
+    "rust": "Rust\n(examples/llama)",
+}
+# Bar colour per path; plot() falls back to grey for unknown paths.
+PATH_COLORS = {
+    "python_baseline": "#888888",
+    "python_torch_compile": "#5ab552",
+    "python_luminal": "#4c9ed9",
+    "rust": "#d97a4c",
+}
+
+
+def run_one_config(config_name, settings, global_skip, inter_path_cooldown=0):
+    """Run all four paths for one config. Returns list of result dicts tagged with 'config'.
+
+    Args:
+        config_name: label stored under ``"config"`` on every result.
+        settings: dict providing ``model``, ``rust_package``, ``prompt``,
+            ``iters``, ``warmups``, ``decode_tokens`` and ``search_iters``;
+            ``dtype`` (default ``"float32"``) and ``skip`` are optional.
+        global_skip: iterable of path names to skip, unioned with ``settings["skip"]``.
+        inter_path_cooldown: seconds to sleep between consecutive executed paths.
+
+    A path that raises does not abort the run: it is recorded as a result
+    carrying ``error`` and ``ttft_ms=None``.
+    """
+    model = settings["model"]
+    rust_package = settings["rust_package"]
+    prompt = settings["prompt"]
+    iters = settings["iters"]
+    warmups = settings["warmups"]
+    decode_tokens = settings["decode_tokens"]
+    search_iters = settings["search_iters"]
+    dtype = settings.get("dtype", "float32")
+    skip = set(global_skip) | set(settings.get("skip", []))
+
+    # CLI arguments shared by every python bench script.
+    common_py = [
+        "--model", model,
+        "--prompt", prompt,
+        "--iters", str(iters),
+        "--warmups", str(warmups),
+        "--decode-tokens", str(decode_tokens),
+        "--dtype", dtype,
+    ]
+    # Only the luminal script receives the search budget flag.
+    luminal_py = common_py + ["--search-iters", str(search_iters)]
+
+    # The rust binary is configured through environment variables instead of flags.
+    rust_env = {"SEARCH_GRAPHS": str(search_iters), "PROMPT": prompt, "ITERS": str(iters)}
+
+    results = []
+    first_path = True
+    # Each path is wrapped in a lambda so nothing is launched until its turn.
+    for path, fn in [
+        ("python_baseline",      lambda: run_python_script("bench_python_baseline.py", common_py)),
+        ("python_torch_compile", lambda: run_python_script("bench_python_torch_compile.py", common_py)),
+        ("python_luminal",       lambda: run_python_script("bench_python_luminal.py", luminal_py)),
+        ("rust",                 lambda: run_rust(prompt, package=rust_package, env_vars=rust_env)),
+    ]:
+        if path in skip:
+            continue
+        # Cool down between executed paths, but not before the first one.
+        if not first_path and inter_path_cooldown > 0:
+            print(f"  [cooldown {inter_path_cooldown}s]", flush=True)
+            time.sleep(inter_path_cooldown)
+        first_path = False
+        _snapshot_memory(f"{config_name}/{path} BEFORE")
+        try:
+            r = fn()
+            r["config"] = config_name
+            r["model"] = model  # ensure correct model is always tagged
+            # Only the luminal-based paths take a search budget.
+            if path in ("python_luminal", "rust"):
+                r["search_iters"] = search_iters
+            results.append(r)
+        except Exception as e:
+            # Record the failure as a placeholder row so later paths still run.
+            print(f"\n[WARN] {config_name}/{path} failed: {e}", flush=True)
+            results.append({
+                "path": path,
+                "config": config_name,
+                "model": model,
+                "error": str(e),
+                "ttft_ms": None,
+            })
+        _snapshot_memory(f"{config_name}/{path} AFTER")
+    return results
+
+
+def plot(results, out_path):
+    """Render TTFT (and TPOT, when available) bar charts to ``out_path``.
+
+    One subplot column is drawn per distinct ``config`` in ``results``, in
+    first-seen order. A second row for TPOT is added only if at least one
+    non-error result carries ``tpot_ms``.
+    """
+    # Imported lazily so matplotlib is only required when plotting.
+    import matplotlib
+
+    # Select the non-interactive Agg backend before pyplot is imported.
+    matplotlib.use("Agg")
+    import matplotlib.pyplot as plt
+
+    # Group by config so each config gets its own subplot column.
+    configs_seen: list[str] = []
+    by_config: dict[str, dict] = {}
+    for r in results:
+        cfg = r.get("config", "default")
+        if cfg not in by_config:
+            configs_seen.append(cfg)
+            by_config[cfg] = {}
+        # A later result for the same (config, path) replaces the earlier one.
+        by_config[cfg][r["path"]] = r
+
+    has_tpot = any(
+        r.get("tpot_ms") is not None
+        for r in results
+        if not r.get("error")
+    )
+    nrows = 2 if has_tpot else 1
+    ncols = len(configs_seen)
+    # squeeze=False keeps `axes` two-dimensional even for a single row or column.
+    fig, axes = plt.subplots(nrows, ncols, figsize=(6 * ncols, 4.5 * nrows), squeeze=False)
+
+    for col, cfg in enumerate(configs_seen):
+        by_path = by_config[cfg]
+        present = [p for p in PATH_ORDER if p in by_path]
+
+        # Draws one bar chart for metric `key`; it is called within this same
+        # iteration, so the captured by_path / present / cfg are the current ones.
+        def _bar(ax, title, ylabel, key):
+            raw = [by_path[p].get(key) for p in present]
+            # Missing values are drawn as zero-height grey bars without a label.
+            ys = [v if v is not None else 0.0 for v in raw]
+            cs = [PATH_COLORS.get(p, "#aaaaaa") if raw[i] is not None else "#cccccc"
+                  for i, p in enumerate(present)]
+            xs = [PATH_LABELS.get(p, p) for p in present]
+            bars = ax.bar(xs, ys, color=cs)
+            ax.set_ylabel(ylabel)
+            ax.set_title(f"{title} — {cfg}")
+            ax.grid(axis="y", alpha=0.3)
+            for b, v in zip(bars, raw):
+                if v is not None:
+                    ax.text(b.get_x() + b.get_width() / 2, v, f"{v:.0f} ms",
+                            ha="center", va="bottom", fontsize=9)
+
+        _bar(axes[0][col], "TTFT", "Time to first token (ms)", "ttft_ms")
+        if has_tpot:
+            _bar(axes[1][col], "TPOT", "Time per output token (ms)", "tpot_ms")
+
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=150)
+    print(f"wrote {out_path}")
+
+
+def run_ur_test(args, conn, run_id):
+    """The ur-test: all 4 paths at default budget + full search sweep, for each model.
+
+    Inserts each result into the DB as it is produced so a mid-run crash still
+    leaves partial data behind.
+
+    Args:
+        args: parsed CLI namespace; ``skip`` and ``no_sweep`` are read here, and
+            the namespace is forwarded to ``_settings_for_config``.
+        conn: open bench DB connection; committed after every batch of inserts.
+        run_id: identifier of the run row the results are attached to.
+
+    Returns:
+        All result dicts produced, comparison and sweep phases combined.
+    """
+    all_results = []
+
+    for model_idx, model_key in enumerate(UR_TEST_MODELS):
+        s = _settings_for_config(model_key, args)
+
+        # Pause between models, but not before the first one.
+        if model_idx > 0:
+            print(f"\n  [cooldown 30s between models]", flush=True)
+            time.sleep(30)
+
+        # ── Phase 1: comparison — all 4 paths at the model's default search budget ──
+        print(f"\n{'='*60}\nUR-TEST COMPARISON: {model_key}\n{'='*60}", flush=True)
+        comp_results = run_one_config(model_key, s, args.skip, inter_path_cooldown=20)
+        for r in comp_results:
+            r["model_key"] = model_key
+            db.insert_result(conn, run_id, r)
+        conn.commit()
+        all_results.extend(comp_results)
+
+        # ── Phase 2: search sweep — python_luminal + rust across all budgets ──
+        if args.no_sweep:
+            continue
+        print(f"\n{'='*60}\nUR-TEST SWEEP: {model_key}\n{'='*60}", flush=True)
+        # The two non-luminal python paths take no search budget, so the sweep skips them.
+        sweep_skip_base = set(args.skip) | {"python_baseline", "python_torch_compile"}
+        # Memory peak in egglog Search grows monotonically with search-iters.
+        # If a path SIGKILLs (-9) at budget N, every higher budget will too —
+        # skip it to avoid wasting another ~hour per model on guaranteed OOMs.
+        oom_paths: set[str] = set()
+        for n in SEARCH_SWEEP_ITERS:
+            print(f"  [cooldown 20s before s={n}]", flush=True)
+            time.sleep(20)
+            sweep_skip = list(sweep_skip_base | oom_paths)
+            if oom_paths:
+                print(f"  [skip-on-prior-OOM] {sorted(oom_paths)} OOM'd at lower budget; skipping at s={n}", flush=True)
+            # Same settings as the comparison phase, with only the budget replaced.
+            sweep_s = {**s, "search_iters": n}
+            results_n = run_one_config(f"s={n}", sweep_s, sweep_skip, inter_path_cooldown=20)
+            for r in results_n:
+                r["model_key"] = model_key  # preserve ur-test model identity for dashboard
+                db.insert_result(conn, run_id, r)
+                # Exit code -9 is matched in the error text raised by the runner helpers.
+                if "code -9" in (r.get("error") or ""):
+                    oom_paths.add(r["path"])
+            conn.commit()
+            all_results.extend(results_n)
+
+    print("\nGenerate report with:")
+    print(f"  python3 benchmarks/ttft/gen_report.py --db benchmarks/ttft/bench.db --run {run_id} \\")
+    print("    --out benchmarks/ttft/report.html")
+    print("\nGenerate dashboard with:")
+    print("  python3 benchmarks/ttft/gen_dashboard.py --out benchmarks/ttft/dashboard.html")
+
+    return all_results
+
+
+def _git_info():
+    """Return (short_commit, branch) from the repo, or ('unknown', 'unknown') if unavailable."""
+    try:
+        commit = subprocess.check_output(
+            ["git", "rev-parse", "--short", "HEAD"],
+            cwd=REPO_ROOT, stderr=subprocess.DEVNULL, text=True,
+        ).strip()
+        branch = subprocess.check_output(
+            ["git", "rev-parse", "--abbrev-ref", "HEAD"],
+            cwd=REPO_ROOT, stderr=subprocess.DEVNULL, text=True,
+        ).strip()
+        return commit, branch
+    except Exception:
+        return "unknown", "unknown"
+
+
+def _gpu_info() -> dict:
+    """Return GPU metadata from nvidia-smi, or empty dict if unavailable."""
+    try:
+        out = subprocess.check_output(
+            [
+                "nvidia-smi",
+                "--query-gpu=name,driver_version,memory.total",
+                "--format=csv,noheader,nounits",
+            ],
+            stderr=subprocess.DEVNULL,
+            text=True,
+        ).strip()
+        if not out:
+            return {}
+        parts = [p.strip() for p in out.splitlines()[0].split(",")]
+        if len(parts) < 3:
+            return {}
+        return {
+            "gpu_name": parts[0],
+            "gpu_driver": parts[1],
+            "gpu_vram_mb": int(parts[2]),
+        }
+    except Exception:
+        return {}
+
+
+def _cuda_version() -> str:
+    """Return CUDA version string from nvidia-smi, or 'unknown'."""
+    try:
+        out = subprocess.check_output(
+            ["nvidia-smi", "--query", "--display=COMPUTE"],
+            stderr=subprocess.DEVNULL,
+            text=True,
+        )
+        for line in out.splitlines():
+            if "CUDA Version" in line:
+                return line.split(":")[-1].strip()
+    except Exception:
+        pass
+    try:
+        out = subprocess.check_output(
+            ["nvidia-smi"], stderr=subprocess.DEVNULL, text=True
+        )
+        import re as _re
+        m = _re.search(r"CUDA Version:\s*([\d.]+)", out)
+        if m:
+            return m.group(1)
+    except Exception:
+        pass
+    return "unknown"
+
+
+def _record_run(conn, mode):
+    """Insert a `runs` row capturing this orchestrator invocation. Returns run_id.
+
+    Uses microsecond resolution in the run_id so two invocations within the
+    same wallclock second never collide on the runs PRIMARY KEY (insert_run
+    defaults to OR IGNORE, which would otherwise silently merge them and
+    corrupt history). Microseconds also let the dashboard plot back-to-back
+    runs at distinct x-positions instead of stacking them on one date label.
+    """
+    now = datetime.datetime.now()
+    run_id = now.strftime("%Y-%m-%dT%H-%M-%S-%f")
+    commit, branch = _git_info()
+    db.insert_run(
+        conn,
+        run_id=run_id,
+        timestamp=now.isoformat(),
+        mode=mode,
+        git_commit=commit,
+        git_branch=branch,
+        cuda_version=_cuda_version(),
+        **_gpu_info(),
+    )
+    conn.commit()
+    return run_id
+
+
+def _settings_from_args(args):
+    """Build a settings dict from parsed CLI args."""
+    return {
+        "model": args.model,
+        "rust_package": args.rust_package,
+        "prompt": args.prompt,
+        "iters": args.iters,
+        "warmups": args.warmups,
+        "decode_tokens": args.decode_tokens,
+        "search_iters": args.search_iters,
+        "dtype": args.dtype,
+        "skip": [],
+    }
+
+
+def _settings_for_config(config_name, args):
+    """Merge CONFIGS[config_name] over CLI arg defaults."""
+    cfg = CONFIGS[config_name]
+    return {
+        "model":        cfg.get("model",        args.model),
+        "rust_package": cfg.get("rust_package", args.rust_package),
+        "prompt":       cfg.get("prompt",       args.prompt),
+        "iters":        cfg.get("iters",        args.iters),
+        "warmups":      cfg.get("warmups",      args.warmups),
+        "decode_tokens":cfg.get("decode_tokens",args.decode_tokens),
+        "search_iters": cfg.get("search_iters", args.search_iters),
+        "dtype":        cfg.get("dtype",        args.dtype),
+        "skip":         cfg.get("skip",         []),
+    }
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument(
+        "--config",
+        choices=list(CONFIGS),
+        default=None,
+        help="Named benchmark configuration. Sets parameter defaults; explicit flags override.",
+    )
+    ap.add_argument(
+        "--all-configs",
+        action="store_true",
+        dest="all_configs",
+        help="Run every entry in CONFIGS into a single run_id in the DB.",
+    )
+    ap.add_argument(
+        "--search-sweep",
+        action="store_true",
+        dest="search_sweep",
+        help=(
+            "Run python_luminal + rust across all SEARCH_SWEEP_ITERS budgets "
+            f"({SEARCH_SWEEP_ITERS}). Uses --config (default: llama-8b) as the base settings."
+        ),
+    )
+    ap.add_argument(
+        "--skip-configs",
+        nargs="*",
+        default=[],
+        choices=list(CONFIGS),
+        dest="skip_configs",
+        metavar="CONFIG",
+        help="Config names to exclude when using --all-configs.",
+    )
+    ap.add_argument(
+        "--no-sweep",
+        action="store_true",
+        dest="no_sweep",
+        help=(
+            "With --ur-test: skip the search-budget sweep phase and only run "
+            "the 4-path comparison for each model. ~1.5 hr instead of ~5 hr."
+        ),
+    )
+    ap.add_argument("--model", default=DEFAULT_MODEL)
+    ap.add_argument("--rust-package", default="llama", dest="rust_package",
+                    help="Cargo package name for the rust bench (examples/<name>).")
+    ap.add_argument("--prompt", default=DEFAULT_PROMPT)
+    ap.add_argument("--iters", type=int, default=3)
+    ap.add_argument("--warmups", type=int, default=1)
+    ap.add_argument("--skip", nargs="*", default=[],
+                    choices=["rust", "python_luminal", "python_baseline", "python_torch_compile"])
+    ap.add_argument("--out", default=str(BENCH_DIR / "ttft.png"))
+    ap.add_argument("--db", default=str(db.DEFAULT_DB_PATH),
+                    help="SQLite database file (default: benchmarks/ttft/bench.db).")
+    ap.add_argument("--run", default=None, dest="run",
+                    help="With --render-only: run_id to render (default: latest).")
+    ap.add_argument(
+        "--decode-tokens", type=int, default=50,
+        help="Tokens to generate for TPOT measurement (0 = skip TPOT).",
+    )
+    ap.add_argument(
+        "--search-iters", type=int, default=500,
+        help="Egraph search iterations for the python_luminal path.",
+    )
+    ap.add_argument(
+        "--dtype", default="float32",
+        choices=["float32", "bfloat16", "float16"],
+        help="Torch dtype for the python paths. Configs may override per-model.",
+    )
+    ap.add_argument(
+        "--render-only", action="store_true",
+        help="Skip running benches; render an existing run from the DB. "
+             "Use --run RUN_ID to pick a specific run, otherwise the latest is used.",
+    )
+    ap.add_argument(
+        "--ur-test", action="store_true", dest="ur_test",
+        help=(
+            f"The mega-test: run all 4 paths at default budget + full search sweep "
+            f"({SEARCH_SWEEP_ITERS}) for each of {UR_TEST_MODELS}."
+        ),
+    )
+
+    # Pre-parse to apply named config as argparse defaults so explicit CLI
+    # flags still override them.
+    pre, _ = ap.parse_known_args()
+    if pre.config and not (pre.all_configs or getattr(pre, "search_sweep", False)):
+        cfg = CONFIGS[pre.config]
+        ap.set_defaults(**{k: v for k, v in cfg.items() if k not in ("skip",)})
+    args = ap.parse_args()
+    if pre.config and not args.all_configs and not args.search_sweep:
+        for path in CONFIGS[pre.config].get("skip", []):
+            if path not in args.skip:
+                args.skip.append(path)
+
+    conn = db.connect(args.db)
+
+    if args.render_only:
+        run_id = args.run or db.latest_run_id(conn)
+        if run_id is None:
+            sys.exit(f"--render-only: no runs found in {args.db}")
+        results = db.load_results(conn, run_id)
+        if not results:
+            sys.exit(f"--render-only: no results found for run {run_id} in {args.db}")
+        print(f"rendering run {run_id} ({len(results)} results)")
+    else:
+        mode = (
+            ("ur-test-fast" if args.no_sweep else "ur-test") if args.ur_test
+            else "search-sweep" if args.search_sweep
+            else "all-configs"  if args.all_configs
+            else "single"
+        )
+        run_id = _record_run(conn, mode)
+        print(f"run_id: {run_id}  →  {args.db}")
+
+        if args.ur_test:
+            results = run_ur_test(args, conn, run_id)
+        elif args.search_sweep:
+            results = []
+            # Base settings come from --config (default: llama-8b) or bare CLI args.
+            base = (
+                _settings_for_config(args.config, args)
+                if args.config
+                else _settings_for_config("llama-8b", args)
+            )
+            sweep_skip = set(args.skip) | {"python_baseline", "python_torch_compile"}
+            for i, n in enumerate(SEARCH_SWEEP_ITERS):
+                if i > 0:
+                    print(f"  [cooldown 20s — letting CUDA free previous model memory]", flush=True)
+                    time.sleep(20)
+                print(f"\n{'='*60}\nSEARCH SWEEP: s={n}\n{'='*60}", flush=True)
+                s = {**base, "search_iters": n}
+                rs = run_one_config(f"s={n}", s, list(sweep_skip))
+                for r in rs:
+                    db.insert_result(conn, run_id, r)
+                conn.commit()
+                results.extend(rs)
+        elif args.all_configs:
+            results = []
+            for config_name in CONFIGS:
+                if config_name in args.skip_configs:
+                    continue
+                print(f"\n{'='*60}\nCONFIG: {config_name}\n{'='*60}", flush=True)
+                settings = _settings_for_config(config_name, args)
+                rs = run_one_config(config_name, settings, args.skip)
+                for r in rs:
+                    db.insert_result(conn, run_id, r)
+                conn.commit()
+                results.extend(rs)
+        else:
+            config_name = args.config or "default"
+            settings = (
+                _settings_for_config(args.config, args)
+                if args.config
+                else _settings_from_args(args)
+            )
+            results = run_one_config(config_name, settings, args.skip)
+            for r in results:
+                db.insert_result(conn, run_id, r)
+            conn.commit()
+
+    # Summary
+    configs_in_results = list(dict.fromkeys(r.get("config", "default") for r in results))
+    for cfg in configs_in_results:
+        group = [r for r in results if r.get("config", "default") == cfg]
+        print(f"\nSummary ({cfg}):")
+        for r in group:
+            if r.get("error"):
+                print(f"  {r['path']:>22}:  FAILED — {r['error']}")
+                continue
+            if r.get("ttft_ms") is None:
+                print(f"  {r['path']:>22}:  no data")
+                continue
+            compile_ms = r.get("compile_ms")
+            compile_str = f"  compile {compile_ms:.0f} ms" if compile_ms is not None else ""
+            tpot = r.get("tpot_ms")
+            tput = r.get("throughput_tps")
+            tpot_str = f"  TPOT {tpot:.2f} ms  ({tput:.1f} tok/s)" if tpot is not None else ""
+            print(f"  {r['path']:>22}:  TTFT {r['ttft_ms']:.2f} ms{compile_str}{tpot_str}")
+
+    plot(results, args.out)
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/ttft/run.sh
+++ b/benchmarks/ttft/run.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# TTFT benchmark entrypoint. Runs via uv against the luminal_python venv.
+set -e
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+REPO_ROOT="$( cd "$SCRIPT_DIR/../.." && pwd )"
+cd "$REPO_ROOT/crates/luminal_python"
+exec uv run python "$SCRIPT_DIR/run.py" "$@"
--- a/benchmarks/ttft/ttft.png
+++ b/benchmarks/ttft/ttft.png
--- a/crates/luminal_cuda_lite/src/kernel/hlir.rs
+++ b/crates/luminal_cuda_lite/src/kernel/hlir.rs
@@ -1200,7 +1200,25 @@ impl KernelOp for KernelScatter {

        // Single-kernel scatter: copy dest→output then scatter src→output[indexes]
        // Launched as 1 block of 1024 threads with __syncthreads() barrier.
-        // Uses float4 vectorized copy (4x throughput) for the copy phase.
+        // Uses float4 vectorized copy (16 bytes per op) for the copy phase.
+        //
+        // The number of dtype elements that fit in a float4 (16 bytes) depends
+        // on the element size. Computing `n_vec = n_dest / 4` would only be
+        // correct for 4-byte dtypes — for bf16 it walks 2× past the end of
+        // `out`, producing CUDA_ERROR_ILLEGAL_ADDRESS once the OOB region
+        // happens to land on an unmapped page.
+        let elements_per_vec: usize = match self.dtype {
+            DType::F64 => 2,
+            DType::F32 | DType::Int => 4,
+            DType::F16 | DType::Bf16 | DType::I16 | DType::U16 => 8,
+            DType::Bool
+            | DType::I8
+            | DType::U8
+            | DType::F8UE8M0
+            | DType::F8E4M3
+            | DType::F8E5M2 => 16,
+            other => panic!("Unsupported dtype for scatter vectorization: {other:?}"),
+        };
        let n_src_elements = self
            .index_shape
            .iter()
@@ -1225,15 +1243,17 @@ extern \"C\" {{
        int tid = threadIdx.x;
        long long n_dest = {n_dest_elements};
        long long n_src = {n_src_elements};
-        // Phase 1: vectorized copy dest → output (float4 = 4 elements per op)
-        long long n_vec = n_dest / 4;
+        // Phase 1: vectorized copy dest → output (float4 = 16 bytes / iter,
+        // i.e. {elements_per_vec} {dtype} elements). n_vec is sized so the
+        // total bytes covered (`n_vec * 16`) never exceed `n_dest * sizeof({dtype})`.
+        long long n_vec = n_dest / {elements_per_vec};
        float4 *out4 = (float4 *)out;
        const float4 *dest4 = (const float4 *)dest;
        for (long long i = tid; i < n_vec; i += blockDim.x) {{
            out4[i] = dest4[i];
        }}
-        // Handle remaining elements
-        long long remainder_start = n_vec * 4;
+        // Handle remaining elements (the dtype-tail past the last full float4).
+        long long remainder_start = n_vec * {elements_per_vec};
        for (long long i = remainder_start + tid; i < n_dest; i += blockDim.x) {{
            out[i] = dest[i];
        }}
@@ -2060,7 +2080,7 @@ extern \"C\" {{
    __global__ void recip_k({dtype} *out, const {dtype} *in{dyn_dims_param}) {{
        long long const_z = (long long)blockIdx.x * blockDim.x + threadIdx.x;
        if (const_z >= {n_elements}) return;
-        out[{out_idx}] = 1.0f / in[{in_idx}];
+        out[{out_idx}] = ({dtype})1.0f / in[{in_idx}];
    }}
 }}"
        );
--- a/crates/luminal_cuda_lite/src/kernel/to_host.rs
+++ b/crates/luminal_cuda_lite/src/kernel/to_host.rs
@@ -7,7 +7,8 @@ use std::cell::RefCell;
 use std::sync::Arc;

 use cudarc::driver::{
-    CudaFunction, CudaModule, CudaSlice, CudaStream, DevicePtr, sys::CUgraphNode,
+    CudaFunction, CudaModule, CudaSlice, CudaStream, DevicePtr,
+    sys::{CUgraphNode, CUresult, cuLaunchKernel},
 };
 use itertools::Itertools;
 use luminal::{
@@ -275,6 +276,14 @@ impl CudaGraphOp {
        buffers: &FxHashMap<NodeIndex, &CudaSlice<u8>>,
        dyn_map: &FxHashMap<char, usize>,
    ) -> anyhow::Result<()> {
+        // Debug path: launch each kernel sequentially with sync between, so the
+        // failing kernel surfaces instead of the generic "CudaGraph" panic.
+        // Enabled whenever `LUMINAL_DEBUG_SEQ` is set (any value, even `0`). Slow — only
+        // for diagnosing CUDA_ERROR_ILLEGAL_ADDRESS / NaN / wrong-output bugs in graph batching.
+        if std::env::var("LUMINAL_DEBUG_SEQ").is_ok() {
+            return self.execute_sequential_for_debug(stream, buffers, dyn_map);
+        }
+
        let mut state = self.state.borrow_mut();
        let _span = span!(Level::TRACE, "cuda_graph", kernels = state.kernels.len()).entered();

@@ -447,6 +456,152 @@ impl CudaGraphOp {
        Ok(())
    }

+    /// Diagnostic path for kernel-level errors that surface as a generic
+    /// `CUDA_ERROR_ILLEGAL_ADDRESS` panic from the batched cuda_graph_exec
+    /// launch. Bypasses CUDA-graph batching entirely: builds params per
+    /// kernel, launches each via `cuLaunchKernel`, and synchronizes the
+    /// stream after every launch. The first launch or sync failure is logged
+    /// to stderr (kernel index, name, node, grid/block, pointers) and
+    /// returned as an error. Buffers not found are passed as pointer value 0.
+    /// Used when the `LUMINAL_DEBUG_SEQ` env var is set (any value, e.g. `=1`).
+    /// ~10–100× slower than the graph path; not for production.
+    fn execute_sequential_for_debug(
+        &self,
+        stream: &Arc<CudaStream>,
+        buffers: &FxHashMap<NodeIndex, &CudaSlice<u8>>,
+        dyn_map: &FxHashMap<char, usize>,
+    ) -> anyhow::Result<()> {
+        let mut state = self.state.borrow_mut();
+        let num_kernels = state.kernels.len();
+
+        // Lazily allocate dyn_dims_buffer, then upload current values (dims absent from dyn_map → 0).
+        if !self.dyn_dims_order.is_empty() && state.dyn_dims_buffer.is_none() {
+            state.dyn_dims_buffer = Some(stream.alloc_zeros::<i32>(self.dyn_dims_order.len())?);
+        }
+        if !self.dyn_dims_order.is_empty() {
+            let values: Vec<i32> = self
+                .dyn_dims_order
+                .iter()
+                .map(|d| dyn_map.get(d).copied().unwrap_or(0) as i32)
+                .collect();
+            if let Some(buf) = state.dyn_dims_buffer.as_mut() {
+                stream.memcpy_htod(&values, buf)?;
+            }
+        }
+        let dyn_dims_ptr = state
+            .dyn_dims_buffer
+            .as_ref()
+            .map(|buf| buf.device_ptr(stream).0)
+            .unwrap_or(0);
+
+        // Collect buffer pointers (mirrors the graph path); aliased outputs share the input pointer.
+        let mut buffer_ptrs: FxHashMap<NodeIndex, u64> = FxHashMap::default();
+        for &node in &self.buffer_nodes {
+            if let Some(buf) = buffers.get(&node) {
+                buffer_ptrs.insert(node, buf.device_ptr(stream).0);
+            }
+        }
+        for kernel in state.kernels.iter() {
+            if let Some(input_idx) = kernel.kernel_op.output_aliases_input()
+                && let Some(&input_ptr) = buffer_ptrs.get(&kernel.inputs[input_idx])
+            {
+                buffer_ptrs.insert(kernel.node, input_ptr);
+            }
+        }
+
+        // Allocate internal buffers (if still empty) + run pre_execute for every kernel up front.
+        for idx in 0..num_kernels {
+            let kernel = &mut state.kernels[idx];
+            if kernel.internal_bufs.is_empty() {
+                kernel.internal_bufs = kernel.kernel_op.allocate_internal_buffers(stream, dyn_map);
+            }
+            kernel.kernel_op.pre_execute(
+                stream,
+                &mut kernel.internal_bufs,
+                &mut kernel.constants,
+                &buffer_ptrs,
+                dyn_map,
+            );
+        }
+
+        let cu_stream = stream.cu_stream();
+
+        for idx in 0..num_kernels {
+            let kernel = &state.kernels[idx];
+            let kernel_name = kernel.kernel_op.kernel_name();
+            let node = kernel.node;
+
+            let grid = (
+                kernel.grid.0.exec(dyn_map).unwrap() as u32,
+                kernel.grid.1.exec(dyn_map).unwrap() as u32,
+                kernel.grid.2.exec(dyn_map).unwrap() as u32,
+            );
+            let block = (
+                kernel.block.0.exec(dyn_map).unwrap() as u32,
+                kernel.block.1.exec(dyn_map).unwrap() as u32,
+                kernel.block.2.exec(dyn_map).unwrap() as u32,
+            );
+            let shared_mem = kernel.shared_mem.exec(dyn_map).unwrap() as u32;
+
+            let output_ptr = buffer_ptrs.get(&node).copied().unwrap_or(0);
+            let input_ptrs: Vec<u64> = kernel
+                .inputs
+                .iter()
+                .map(|inp| buffer_ptrs.get(inp).copied().unwrap_or(0))
+                .collect();
+
+            let param_values = kernel.kernel_op.build_params(
+                stream,
+                output_ptr,
+                &input_ptrs,
+                &kernel.internal_bufs,
+                dyn_dims_ptr,
+            );
+            let mut params = UnifiedKernelParams::new(param_values);
+            let cu_func = unsafe { kernel.function.raw_function() };
+
+            let result = unsafe {
+                cuLaunchKernel(
+                    cu_func,
+                    grid.0,
+                    grid.1,
+                    grid.2,
+                    block.0,
+                    block.1,
+                    block.2,
+                    shared_mem,
+                    cu_stream,
+                    params.as_cuda_params(),
+                    std::ptr::null_mut(),
+                )
+            };
+            if result != CUresult::CUDA_SUCCESS {
+                eprintln!(
+                    "[seq-debug] kernel #{idx}/{num_kernels} '{kernel_name}' \
+                     node={node:?} grid={grid:?} block={block:?} \
+                     output_ptr={output_ptr:#x} inputs={input_ptrs:#x?} \
+                     LAUNCH FAILED: {result:?}"
+                );
+                anyhow::bail!(
+                    "kernel #{idx} '{kernel_name}' (node {node:?}) launch failed: {result:?}"
+                );
+            }
+            if let Err(e) = stream.synchronize() {
+                eprintln!(
+                    "[seq-debug] kernel #{idx}/{num_kernels} '{kernel_name}' \
+                     node={node:?} grid={grid:?} block={block:?} \
+                     output_ptr={output_ptr:#x} inputs={input_ptrs:#x?} \
+                     SYNC FAILED: {e}"
+                );
+                anyhow::bail!(
+                    "kernel #{idx} '{kernel_name}' (node {node:?}) sync failed: {e}"
+                );
+            }
+        }
+
+        Ok(())
+    }
+
    /// Build the CUDA graph from compiled kernels.
    fn build_graph(
        &self,
--- a/crates/luminal_python/LessonsLearned.md
+++ b/crates/luminal_python/LessonsLearned.md
@@ -749,6 +749,92 @@ candidates rejected" during search, check whether the rejection is from actual f
 or from dtype misinterpretation — the key diagnostic is whether the NaN pattern is
 identical across all attempts (dtype issue) vs varying (actual numerical issue).

+## 2026-04-22 — Benchmark python_luminal Path: NativeRuntime Panic on CUDA Weights
+
+### What the symptom was
+
+Running `benchmarks/ttft/run.py` with the `python_luminal` path panicked deep in Rust:
+
+```
+thread panicked at src/hlir.rs:2239:40: no entry found for key
+```
+
+The panic occurred in `NativeRuntime::execute` when the `Output` node tried to read its
+predecessor's buffer from `self.buffers` — and the buffer wasn't there.
+
+### What the actual root cause was
+
+The luminal Python wheel was built without `--features cuda` (plain `maturin build --release`).
+This means `_cuda_lite_factory_capsule` is not compiled into the `.so` file. In `main.py`,
+`_detect_factory_capsule` catches the resulting `ImportError` and **silently** falls back to
+`_native_factory_capsule` (NativeRuntime / CPU runtime).
+
+The benchmark model (`LlamaForCausalLM.from_pretrained(...).to("cuda")`) has all weights as
+CUDA device pointers. `BackendCompileArgs.device_ptrs` is populated with these GPU pointers.
+NativeRuntime has no mechanism to handle GPU-resident weight data — the `device_ptrs` map is
+simply ignored. After search completes (it can search because it uses dummy CPU data during
+profiling), the first real `execute()` call processes the graph:
+
+1. `Input` nodes are skipped (their buffers should be pre-populated by `set_input_from_ptr`)
+2. Weight `Input` nodes were set via `set_input_device_ptr` — but NativeRuntime's
+   `set_input_device_ptr` likely no-ops or stores garbage, leaving those buffers empty
+3. The `Output` node looks up its predecessor's buffer → key not found → panic
+
+### Why it was hard to find
+
+1. **Silent fallback**: `_detect_factory_capsule` catches `ImportError` without logging a
+   warning. Nothing in stdout indicates you're running on CPU when the model is on GPU.
+2. **Search succeeds**: The e-graph search runs to completion (searches 1 group, 1 chunk in
+   ~15s) because it uses 1.0f32 dummy data that doesn't need GPU. The failure only occurs at
+   first real execution.
+3. **Misleading error site**: `hlir.rs:2239` is in NativeRuntime's buffer-copy loop for Output
+   nodes — it gives no indication that the root cause is a missing CUDA feature flag at build time.
+4. **Backtrace required**: Without `RUST_BACKTRACE=1`, only the panic message is visible;
+   the `NativeRuntime` frame that reveals the CPU fallback is hidden.
+
+### The fix
+
+Rebuild the wheel with CUDA support:
+```bash
+maturin build --release --features cuda
+pip install target/wheels/luminal_python-*.whl --force-reinstall
+```
+
+Or via the test runner: `./run_tests_cuda.sh` uses `maturin develop --features cuda -r`.
+
+Consider adding an explicit warning or error in `_detect_factory_capsule` when CUDA inputs are
+detected but no CUDA factory is available:
+
+```python
+if device.type == "cuda":
+    try:
+        from .luminal import _cuda_lite_factory_capsule
+        return _cuda_lite_factory_capsule()
+    except ImportError:
+        import warnings
+        warnings.warn(
+            "CUDA inputs detected but luminal was built without --features cuda. "
+            "Falling back to NativeRuntime (CPU) — this will likely panic at runtime.",
+            RuntimeWarning,
+            stacklevel=3,
+        )
+```
+
+### The regression test
+
+`test_hf_llama3_8b_instruct_1layer` in `tests/test_llama3.py` — tests the exact architecture
+from the benchmark (Meta-Llama-3-8B-Instruct, 4096 hidden, 32 attn heads, 8 KV heads) with
+1 layer and random weights. This test passes with `--features cuda` and panics without it.
+
+### General principle
+
+**When a feature gate silently changes the runtime backend, assert that the selected backend
+is compatible with the input device.** A CUDA tensor flowing into a CPU-only runtime is always
+a programming error, not a graceful degradation. The failure should surface at factory
+selection time (with a clear error message), not deep in a Rust buffer-copy loop.
+
+---
+
 ## 2026-03-25 — KernelExp/KernelSigmoid: Fused CUDA Kernels for Precision

 1. **Symptom**: `test_hf_llama3_full` (16-layer Llama-3.2-1B) had ~1e-4 max diff vs PyTorch.
@@ -757,6 +843,44 @@ identical across all attempts (dtype issue) vs varying (actual numerical issue).
 4. **Fix**: Added `KernelExp` (uses `expf()`), `KernelSigmoid` (uses `1/(1+expf(-x))`), and Kahan summation in SumReduce. Each uses both `kernel_rewrite` and a direct egglog pattern match with range checks (e.g., `(> ?val 1.44) (< ?val 1.45)`) to bypass constant format dependency.
 5. **Principle**: When decomposed CUDA kernel chains cause precision loss, add fused kernels via `kernel_rewrite`. For robustness, add BOTH the logical-op rewrite path AND a direct HLIR pattern match — the constant format in egglog can be fragile.

+---
+
+## 2026-04-23 — NativeRuntime Multi-Call Panic: Input Buffers Cleared After Each Run
+
+1. **Symptom**: The compiled model panicked with `hlir.rs:XXXX: no entry found for key` on the second call. First call succeeded; subsequent calls failed.
+2. **Root cause**: `NativeRuntime::execute` in `src/hlir.rs` called `self.buffers.retain(|k, _| output_nodes.contains(k))` after each run to free intermediate buffers. This correctly pruned temporary buffers but also pruned the Input-node buffers that hold model weights — so on the second call, the weight tensors were gone.
+3. **Why hard**: The bug never manifested in the test suite because every test called the compiled model exactly once per compile. The issue only appeared when running a bench loop that called the model multiple times. The panic location (deep in buffer lookup) gave no indication that the root cause was in the buffer retention policy.
+4. **Fix**: Changed the retain predicate to keep both `Output` and `Input` nodes (sketch, not verbatim code):
+   ```rust
+   let keep_nodes = graph.node_indices()
+       .filter(|n| is::<Output> || is::<Input>)
+       .collect();
+   self.buffers.retain(|k, _| keep_nodes.contains(k));
+   ```
+5. **Principle**: When buffer lifetime policies are changed to free memory after a run, always verify that *persistent* state (model weights stored in Input nodes) is excluded from the cleanup sweep. A test that compiles + calls once per test function will never catch a multi-call regression — add a dedicated multi-call test for any compiled runtime.
+
+---
+
+## 2026-04-23 — PT2 USER_INPUT_MUTATION Outputs Confuse Dynamo Caller
+
+1. **Symptom**: With `StaticCache`, the compiled model returned a tensor of shape `[1]` (the `cumulative_length` update) instead of the `[1, vocab_size]` logits. The wrong tensor was silently mapped to the output variable.
+2. **Root cause**: When `torch.export` encounters in-place mutations to input tensors (KV cache updates via `index_copy_`), it lifts them as `USER_INPUT_MUTATION` output specs, placed *before* the actual `USER_OUTPUT` logits in `ep.graph_signature.output_specs`. The compiled model returned all outputs; dynamo mapped index 0 (the mutation) to the first return value.
+3. **Why hard**: The output shape `[1]` from `cumulative_length` looked like a valid (though wrong) output. No error was raised — just wrong logits. Required inspecting `ep.graph_signature.output_specs` and understanding the ordering convention for different `OutputKind` values.
+4. **Fix**: In `pt2_backend`, parse `output_specs` to build a `mutation_mappings` list and `user_output_indices`. Wrap the compiled model to: (a) copy mutation outputs back into the corresponding input tensors, and (b) return only the `USER_OUTPUT` tensors.
+5. **Principle**: After `torch.export(...).run_decompositions()`, always inspect `ep.graph_signature.output_specs` when the model has in-place operations (KV cache, BN running stats). The output ordering is: mutations first, then actual outputs — and the caller only expects actual outputs.
+
+---
+
+## 2026-04-23 — CUDA Version Mismatch: torch+cuXXX Must Match System Driver
+
+1. **Symptom**: `torch.cuda.is_available()` returned `False` despite `nvidia-smi` showing a GPU. Warning: "CUDA initialization: The NVIDIA driver on your system is too old (found version 12080)."
+2. **Root cause**: `torch==2.11.0+cu130` requires CUDA 13.0 which needs driver >= 575. The system has driver 570 (CUDA 12.8 max). The mismatch caused silent CPU fallback — no error, just False from `is_available()`.
+3. **Why hard**: The bench appeared to start successfully (model loaded, compilation ran) but produced no results because it was running an 8B model on CPU. Zero output with exit code 0 looked like a hang or silent crash.
+4. **Fix**: Installed `torch==2.11.0+cu128` from `https://download.pytorch.org/whl/cu128`. CUDA 12.8 matches driver 570. Also needed matching `torchvision==0.26.0+cu128` and the `nvidia-cusparselt-cu12` runtime library.
+5. **Principle**: Before running any CUDA-dependent bench or test, verify `torch.cuda.is_available()` returns `True`. Check the `nvidia-smi` "CUDA Version" field against the `+cuXXX` suffix in `torch.__version__` — they must be compatible: the wheel's CUDA runtime version must be ≤ the driver's maximum supported CUDA version. Never assume CPU fallback "works" for large model benchmarks.
+
+---
+
 ## 2026-04-26 — Loop unroll-union rules silently disabled in full egglog stage

 1. **Symptom**: Python `test_llama_transformer_block` (CUDA backend) produced output ~1e-2 off from PyTorch (atol=1e-4) on the `loop_rolling` branch. All component tests (RMSNorm, attention, SwiGLU, RoPE) passed. The diff pattern was suspicious: row 0 of the (1,4,32) output matched exactly, rows 1–3 differed slightly. Disabling rolling fixed it.
@@ -767,6 +891,8 @@ identical across all attempts (dtype issue) vs varying (actual numerical issue).
 4. **Fix**: Register `binary_op_unroll_rules` in BOTH `early_rewrites()` (so fusion patterns like GLUMoE can match before the early-stage extract, which is what fixed `test_glumoe_gemma_gelu_matches_unfused_output` earlier in the session) AND `rewrites()` (so kernel-level rewrites like `direct-exp-fusion` can match in the full stage on the unrolled chain). One block per binary op (`Add`, `Mul`, `Mod`, `LessThan`).
 5. **Principle**: When egglog has multiple stages (early/full) with disjoint rule sets, any rewrite that materialises new HLIR/IR enodes (rather than just lowering to LLIR) needs to fire in BOTH stages if downstream rewrites in BOTH stages might want to see the new structure. Putting "preparatory" rewrites only in `early_rewrites` means their effect is lost across the early→full handoff. The narrow rule of thumb: if your rule's outputs are intended to enable matches by other rules, audit which stages those other rules run in and register accordingly.

+---
+
 ## 2026-04-26 — `unroll_loops_in_llir` panicked on iteration-invariant body producers

 1. **Symptom**: Modal CI/CD job for the gemma example panicked at `src/graph.rs:1867` with `no entry found for key`. The line is `clone_map[i - 1][&body_producer]` inside `unroll_loops_in_llir`'s `resolve_src` closure — `body_producer` (the LoopEnd's incoming source for that slot) wasn't a key in the per-iteration clone map. cuda_lite/python tests didn't repro: only triggered by the specific genome and graph shapes that gemma's longer search settles on.
@@ -775,6 +901,8 @@ identical across all attempts (dtype issue) vs varying (actual numerical issue).
 4. **Fix**: in `unroll_loops_in_llir::resolve_src`, when the LoopStart-resolved `body_producer` isn't in `body_nodes`, return `body_producer` itself for iter > 0 instead of indexing `clone_map[i - 1]`. The body op didn't depend on the loop variable, so every iter > 0 carries the same value forward — using `body_producer` directly is semantically correct. Mirrored the same `unwrap_or(body_producer)` fallback in the post-loop substitution map (`marker_post_sub` for LoopEnd / LoopOutputSelect). Added a backward-walk-from-end-markers backfill in `collapse_loops_to_first_iter` so its body-node iteration also covers these nodes (it doesn't have a clone_map, but does need to rewire body ops' incoming edges before deleting markers).
 5. **Principle**: When a graph-walk-derived set is used as a hashmap key requirement, every code path that *could* produce a key outside that set needs a graceful fallback — not just a defensive `expect`. For loop unrolling specifically, the rule is: `body_nodes` is the set of "ops that participate in per-iter computation"; ops on the LoopEnd's path that *don't* participate (iteration-invariant) are still legitimate, and need a "no clone, share across iters" path through `resolve_src` and `marker_post_sub`. Forward-walk-only `body_nodes` is correct only when extraction never produces iteration-invariant body producers — and in an egglog-driven search, that's not a guarantee you can make.

+---
+
 ## 2026-04-26 — Iteration-invariant state slots are a first-class concept, not a defensive fallback

 1. **Symptom + fix recap**: gemma Modal CI panicked at `clone_map[i-1][&body_producer]` because some state slots' `body_producer` (LoopEnd's incoming) isn't in `body_nodes` (forward walk from input markers). The first commit pair (16de9638 / 93fb02c4) caught this with `.unwrap_or(body_producer)` — which works but reads as "defensive, unclear *why* this case exists."
@@ -782,3 +910,143 @@ identical across all attempts (dtype issue) vs varying (actual numerical issue).
 3. **Why "defensive fallback" framing is misleading**: it implies the LLIR is broken. It isn't. The forward-walk-only `body_nodes` definition just doesn't cover this case, because the case requires no per-iter cloning at all. A *node not reachable from any loop input marker has no input-marker ancestor*, so by construction its value doesn't depend on the loop's per-iter state.
 4. **Cleaner formulation**: name the concept. Compute an `iteration_invariant_slots: HashSet<LoopStart>` set at the same time `start_meta` is built, with the rule `body_producer ∉ body_nodes ⇒ iteration_invariant`. `resolve_src` and `marker_post_sub` then have explicit branches: if the slot is invariant, use `body_producer` directly; otherwise the standard per-iter clone lookup. The behavior is the same as the `unwrap_or` band-aid, but the code now documents that this is a real, sound case the unroll handles correctly — not a panic suppressor.
 5. **Principle**: when an `unwrap_or` papers over a case that turns out to be semantically valid, the right cleanup isn't to keep the `unwrap_or` and add a comment — it's to name the case. Hoist the predicate into a set or enum and branch on it explicitly. The compiler then enforces that every consumer of the per-iter cloning machinery has an opinion on iteration-invariant slots, instead of silently relying on a `Map::get` returning `None` at the right moment.
+
+---
+
+## 2026-04-30 — `translate_grouped_mm` cast the full expert weight to F32, OOMing search on Qwen3-MoE
+
+### What the symptom was
+
+`benchmarks/ttft/run.py --config qwen3-moe` crashed every search-profile attempt with:
+```
+crates/luminal_cuda_lite/src/runtime.rs:711: called `Result::unwrap()` on an `Err` value:
+  DriverError(CUDA_ERROR_OUT_OF_MEMORY, "out of memory")
+```
+The DB shows this had been failing every run for ~2 weeks. The rust `examples/qwen3_moe` ran fine end-to-end. python_baseline / python_torch_compile / qwen3-4b were all fine — only python_luminal × qwen3-moe failed.
+
+### What the actual root cause was
+
+`translate_grouped_mm` in `crates/luminal_python/rust/src/translator/tensor.rs` was lowering HF's `_grouped_mm(input, weight, offs)` op to a *full-broadcast* batched matmul plus a group-mask:
+
+```rust
+let weight_f = weight.cast(DType::F32);                  // [G=128, K, N] cast → 1.5 GB / layer
+let input_batched = input_f.expand_dim(0, g);
+let all_out = input_batched.matmul(weight_f);            // [G, S, N]
+let mask = ... (g_arange == expert_id).cast(F32);
+let out = (all_out * mask.expand_dim(2, n)).sum(0);      // mask + sum over G
+```
+
+The full `[G, K, N]` F32 cast intermediate is 1.5 GB / layer for gate-up and 0.6 GB / layer for down on Qwen3-30B-A3B. With 60 GB of persistent bf16 weights already on a 97 GB GPU, the search-time profiler ran out of memory allocating those casts.
+
+By contrast, `examples/qwen3_moe`'s `gather_experts` gathers only the top-K active experts per token first, then casts that small `[s, k, d1, d2]` slice (~100 MB / layer). The GLUMoE host op (`crates/luminal_cuda_lite/src/host/moe/glumoe_rewrite.egg`) is also wired to this gather pattern.
+
+### Why it was hard to find
+
+1. **Code path was reasonable in isolation**: at small scale (`test_grouped_mm_fallback`: g=2, K=8, N=16) the broadcast version was fine — the F32 cast was only 1 KB, and search profiling never noticed.
+2. **The error reported "out of memory" but the rest of the system looked healthy**: 60 GB weights + 37 GB headroom looks like plenty until you realise 48 layers × 2.1 GB cast intermediates per layer doesn't fit, even after loop rolling.
+3. **The DB's `code 1` failures looked the same as a Python exception** — the actual panic site (`runtime.rs:711:64` `stream.alloc_zeros(needed_bytes).unwrap()`) had to be recovered from a tmux scrollback because the orchestrator's stdout was already torn down by the time we looked.
+
+### The fix
+
+Rewrote `translate_grouped_mm` to gather first, matmul second:
+
+```rust
+// expert_id[m] = first g s.t. m < offs[g], clamped to [0, G-1]
+let expert_id = ge_boundary.sum(0).minimum_f32(g_max_f).cast(DType::Int);
+
+// flat_idx = expert_id * (K*N) + iota('z', (K, N))   — same shape as
+// rust qwen3_moe's `gather_experts`
+let flat_idx = (expert_id * (k * n))
+    .expand_dim(1, k).expand_dim(2, n)
+    + self.graph.iota(Expression::from('z'), (k, n)).expand_dim(0, s);
+
+let weight_gathered = weight.gather(flat_idx);            // [S, K, N], bf16
+let result = input.cast(F32).unsqueeze(1)
+    .matmul(weight_gathered.cast(F32))                    // [S, 1, N]
+    .squeeze(1);
+```
+
+Two important details:
+
+1. **Clamp `expert_id` to `[0, G-1]`**: at search time, dummy data fills `offs` with all-1s (`make_ones_bytes` in `compile_backend`). For S>1 that pushes `expert_id` to G (boundary count = G), which is one past the last valid expert and OOBs the gather. HF's own grouped-MM forward also clamps for the same reason (invalid expert IDs from EP).
+2. **Don't cast the full weight**: the cast moved from before the batched-matmul (over `[G, K, N]`) to after the gather (over `[S, K, N]`). 16× shrink at prefill (S=top_k=8 vs G=128).
+
+### Result
+
+`search-iters=1` end-to-end works on Qwen3-30B-A3B: `BENCH_RESULT … "ttft_ms": 9350.5, "tpot_ms": 1166.7`. The OOM is gone.
+
+`search-iters>=5` still crashes — but with a *different*, downstream `CUDA_ERROR_ILLEGAL_ADDRESS` during execution after search completes. That looks like the same family as the 2026-03-07 / 2026-03-09 egglog-extractor non-determinism bugs (some mutation during search picks a kernel/rewrite combo that's broken at this scale). It's a separate investigation — the gather-based lowering is correct in isolation (`test_grouped_mm_fallback` passes; a synthetic `g=128, S=8, K=2048, N=1536` bf16 test passes with max-diff ~2.4e-4).
+
+### General principle
+
+**When lowering an op that takes a per-row index over a large parameter, gather first and cast second — never cast the full parameter to F32 just because your matmul kernel is F32-only.** A "broadcast over G + mask" pattern is mathematically equivalent to "gather per-row" but materialises a G× larger intermediate — fine for tests, ruinous on real MoE checkpoints. When in doubt, mirror the rust example's pattern: the egglog fusion rules (GLUMoE here) are written to recognise the gather form, not the broadcast-and-mask form.
+
+Also: search-time dummy-1 inputs do not carry the same values as runtime inputs — the shapes match, but every element is 1. Anything you compute from a runtime tensor (cumsum offsets, routing indices, mask boundaries) needs to remain in-bounds for the dummy. Clamp index-producing chains as a matter of course, not just when the math says you "should" — `make_ones_bytes` is a hostile witness.
+
+---
+
+## 2026-05-01 — `KernelScatter` float4 vectorization copied 2× the buffer size (overrunning the end) for bf16/f16 KV cache
+
+### What the symptom was
+
+After the `translate_grouped_mm` gather rewrite (above) cleared the OOM, the qwen3-moe bench progressed past search but panicked during execution roughly 40% of the time:
+```
+crates/luminal_cuda_lite/src/runtime.rs:1204:
+  CUDA execute error in "CudaGraph":
+    DriverError(CUDA_ERROR_ILLEGAL_ADDRESS, "an illegal memory access was encountered")
+```
+qwen3-4b (dense) was unaffected; the bf16 KV cache in HF `StaticCache` was the only path triggering it. The rust `examples/qwen3_moe` ran fine because it uses an F32 KV cache.
+
+### What the actual root cause was
+
+`KernelScatter::compile` in `crates/luminal_cuda_lite/src/kernel/hlir.rs` emitted a hand-written CUDA copy phase that vectorised through `float4` (16-byte) reads/writes:
+
+```cuda
+long long n_vec = n_dest / 4;          // ← assumes 4-byte dtype
+float4 *out4 = (float4 *)out;
+const float4 *dest4 = (const float4 *)dest;
+for (long long i = tid; i < n_vec; i += blockDim.x) {
+    out4[i] = dest4[i];                 // ← writes 16 B per iteration
+}
+long long remainder_start = n_vec * 4;  // ← also assumes 4 elem/vec
+```
+
+For `dtype=F32` (4 bytes), `n_vec * 16 = n_dest * 4` bytes — exactly fills the buffer. For `dtype=Bf16` (2 bytes), `n_vec * 16 = (n_dest/4) * 16 = n_dest * 4` bytes, which is **2× the actual buffer size of `n_dest * 2` bytes**. The second half of the copy (`n_dest * 2` bytes, a full buffer's worth) therefore lands past the end of `out`, and the matching reads run past the end of `dest`.
+
+Whether that produced an `ILLEGAL_ADDRESS` depended on whether the OOB region happened to land on an unmapped page. For different search outcomes, the surrounding allocator state differed → ~60% it was silent corruption, ~40% it crashed the CUDA context. That probabilistic mix is why the bug had been hidden — no test exercised a bf16 scatter (every existing scatter test uses F32 by default), and the rust example uses F32 KV cache so it was never seen there either.
+
+### Why it was hard to find
+
+1. **Probabilistic, but determined by the search outcome**: the rewrite from HLIR `Scatter` → `KernelScatter` always fires (it's the only non-NoCopy path), so the kernel is always present. The crash depends on memory layout, which depends on which other kernels the search picked. That made it look like an egglog-mutation issue rather than a kernel-correctness issue.
+2. **Existing test coverage was F32-only**: `test_scatter_execution_correctness` (in `tests/consumed_buffer_tests.rs`) explicitly tries 50 random extractions to cover both `Scatter` and `ScatterNoCopy`, but always with `cx.tensor(5)` which defaults to F32. The bug would never surface there.
+3. **The panic message hid the kernel name**: it surfaced as a generic `"CudaGraph"` host-op panic — the cuda_graph_exec batches all kernels into one atomic launch, so the failing kernel disappears into the batch. To localize it I had to add a `LUMINAL_DEBUG_SEQ` env var to `CudaGraphOp::execute_internal` that bypasses graph batching and launches each kernel via `cuLaunchKernel` with a sync afterwards, surfacing kernel name + node + grid/block/pointers when one fails.
+
+### The fix
+
+Parameterise `n_vec` and the remainder-loop start by the number of dtype elements that fit in 16 bytes:
+
+```rust
+let elements_per_vec: usize = match self.dtype {
+    DType::F64 => 2,
+    DType::F32 | DType::Int => 4,
+    DType::F16 | DType::Bf16 | DType::I16 | DType::U16 => 8,
+    DType::Bool | DType::I8 | DType::U8
+        | DType::F8UE8M0 | DType::F8E4M3 | DType::F8E5M2 => 16,
+    other => panic!("Unsupported dtype for scatter vectorization: {other:?}"),
+};
+```
+and substitute `{elements_per_vec}` into the kernel template (both the `n_vec` calc and `remainder_start`). For F32 / Int the generated code is byte-for-byte identical to before, so existing F32 tests are unaffected; for any other dtype the byte coverage now exactly equals `n_dest * sizeof(dtype)` as intended.
+
+### Result
+
+Before fix: 3/5 success at iters=10 (probabilistic).
+After fix: 5/5 at iters=10, 3/3 at iters=50. All 206 HLIR tests still pass. TTFT/TPOT identical (~9.35s / ~1.17s).
+
+### General principle
+
+**Hand-rolled CUDA vectorisation with a fixed-width type (`float4`, `float2`, `int4`, …) is almost always specialised to one element size.** When the same kernel template is parameterised by `dtype`, every byte-count expression has to be too. The cheapest correct form is "elements per vector load" computed from the dtype's byte size — never hardcode `/4`.
+
+Also: **F32 is not a representative test dtype for kernels with vector loads.** When a kernel is written generic-over-dtype, the test matrix needs to actually exercise the dtypes (bf16, f16, bool) where the vector-element-count differs. A `test_scatter_bf16` would have caught this years before the qwen3-moe bench did. Same trap likely exists wherever else `float4` is cast over a `{dtype} *` template.
+
+Diagnostic also added: `LUMINAL_DEBUG_SEQ=1` on the python_luminal path will now bypass `CudaGraphOp` batching at execute time, launching each kernel sequentially with a sync afterwards. If a future ILLEGAL_ADDRESS hides inside a batched graph again, this surfaces the kernel name and node index immediately.
+
--- a/crates/luminal_python/pyproject.toml
+++ b/crates/luminal_python/pyproject.toml
@@ -46,4 +46,5 @@ dev = [
    "transformers>=4.40.0",
    "diffusers>=0.35.0",
    "modal>=1.3.5",
+    "matplotlib>=3.8",
 ]
--- a/crates/luminal_python/rust/src/pt2_parser.rs
+++ b/crates/luminal_python/rust/src/pt2_parser.rs
@@ -160,7 +160,31 @@ pub fn parse_pt2(path: &str) -> Result<ParsedPT2> {
    let file = File::open(path).with_context(|| format!("Failed to open PT2 file: {path}"))?;
    let mut archive = ZipArchive::new(file).context("Failed to read PT2 ZIP archive")?;

-    // Determine archive prefix from the first entry
+    // Torch >= 2.6 uses a flat archive with no prefix directory; detect by presence of the
+    // well-known root-level file.  Older torch used a prefix (e.g. "archive/models/model.json").
+    let is_new_format = archive
+        .file_names()
+        .any(|n| n == "serialized_exported_program.json");
+
+    if is_new_format {
+        let program: ExportedProgram = {
+            let mut entry = archive.by_name("serialized_exported_program.json")?;
+            let mut buf = String::new();
+            entry.read_to_string(&mut buf)?;
+            serde_json::from_str(&buf)
+                .context("Failed to parse serialized_exported_program.json")?
+        };
+        // Tensor constants live in serialized_constants.pt; Python extracts them
+        // and loads them post-compile via set_weight_from_ptr.
+        return Ok(ParsedPT2 {
+            program,
+            constants_config: None,
+            archive_prefix: String::new(),
+            pt2_path: path.to_string(),
+        });
+    }
+
+    // Old prefix-based format.
    let archive_prefix = {
        let first = archive
            .file_names()
--- a/crates/luminal_python/rust/src/translator/dispatch.rs
+++ b/crates/luminal_python/rust/src/translator/dispatch.rs
@@ -183,6 +183,9 @@ impl<'a> Translator<'a> {
            "torch.ops.aten.arange.start_step" => self.translate_arange(node)?,
            "torch.ops.aten.full.default" => self.translate_full(node)?,
            "torch.ops.aten.full_like.default" => self.translate_full_like(node)?,
+            "torch.ops.aten.empty_permuted.default"
+            | "torch.ops.aten.empty.memory_format" => self.translate_empty(node)?,
+            "torch.ops.aten.histc.default" => self.translate_histc(node)?,

            // Grouped matmul (MoE expert dispatch).
            // aten._grouped_mm is the native op; transformers::grouped_mm_fallback
@@ -203,6 +206,7 @@ impl<'a> Translator<'a> {
            "torch.ops.aten.lt.Scalar" => self.translate_scalar_comparison(node, |a, s| a.lt(s))?,
            "torch.ops.aten.ge.Scalar" => self.translate_scalar_comparison(node, |a, s| a.ge(s))?,
            "torch.ops.aten.le.Scalar" => self.translate_scalar_comparison(node, |a, s| a.le(s))?,
+            "torch.ops.aten.eq.Scalar" => self.translate_scalar_comparison(node, |a, s| a.eq(s))?,

            // Tensor comparisons
            "torch.ops.aten.ne.Scalar" => {
@@ -293,24 +297,40 @@ impl<'a> Translator<'a> {
            }
            "torch.ops.aten.erf.default" => {
                let a = self.get_input_tensor(node, 0)?;
-                // Abramowitz & Stegun approximation 7.1.28 (max error ~1.5e-7)
-                // erf(x) = sign(x) * (1 - poly(t) * exp(-x^2))
-                // where t = 1/(1 + 0.3275911*|x|), poly in Horner form
-                let ax = a.abs();
-                let x2 = a * a;
-                let t = (ax * 0.3275911_f32 + 1.0).reciprocal();
-                // Horner: t*(a1 + t*(a2 + t*(a3 + t*(a4 + t*a5))))
-                let poly = t
-                    * (t * (t
-                        * (t * (t * 1.061_405_4_f32 + (-1.453_152_1_f32)) + 1.421_413_8_f32)
-                        + (-0.284_496_72_f32))
-                        + 0.254_829_6_f32);
-                let result_abs =
-                    self.graph.constant_float(1.0).expand_rhs(a.shape) - poly * (x2 * (-1.0)).exp();
-                // sign(x) = 2*(x >= 0) - 1
-                let zero = self.graph.constant_float(0.0).expand_rhs(a.shape);
-                let sign = a.ge(zero).cast(DType::F32) * 2.0 - 1.0;
-                result_abs * sign
+                self.erf_approx(a)
+            }
+            "torch.ops.aten.gelu.default" => {
+                let a_in = self.get_input_tensor(node, 0)?;
+                // PyTorch's gelu has a kwarg `approximate` (default "none").
+                // "none"  → 0.5 * x * (1 + erf(x / sqrt(2)))           (exact)
+                // "tanh"  → 0.5 * x * (1 + tanh(c * (x + 0.044715*x^3)))
+                //          where c = sqrt(2/pi) ≈ 0.7978845608
+                // Gemma family uses approximate="tanh" but lowering may emit
+                // either form; honour whatever the FX graph carries.
+                let approximate = node.inputs.iter().find_map(|input| {
+                    if input.name == "approximate"
+                        && let Argument::Other(val) = &input.arg
+                    {
+                        return val.as_str().map(|s| s.to_string());
+                    }
+                    None
+                });
+                // Promote to F32 around the constants/comparisons (same reason
+                // as clamp/erf — luminal binary ops assert matching dtypes).
+                let orig = a_in.dtype;
+                let a = if orig == DType::F32 { a_in } else { a_in.cast(DType::F32) };
+                let half = self.graph.constant_float(0.5).expand_rhs(a.shape);
+                let one = self.graph.constant_float(1.0).expand_rhs(a.shape);
+                let result = if approximate.as_deref() == Some("tanh") {
+                    let x2 = a * a;
+                    let inner = a * (x2 * 0.044715_f32 + 1.0) * 0.797_884_56_f32;
+                    half * a * (one + inner.tanh())
+                } else {
+                    let scaled = a * 0.707_106_77_f32; // 1 / sqrt(2)
+                    let erf_val = self.erf_approx(scaled);
+                    half * a * (one + erf_val)
+                };
+                if orig == DType::F32 { result } else { result.cast(orig) }
            }
            "torch.ops.aten.isnan.default" => {
                let a = self.get_input_tensor(node, 0)?;
--- a/crates/luminal_python/rust/src/translator/mod.rs
+++ b/crates/luminal_python/rust/src/translator/mod.rs
@@ -68,6 +68,9 @@ impl<'a> Translator<'a> {
    fn translate_graph(&mut self) -> Result<()> {
        self.create_inputs()?;

+        // Per-block partitioning is now handled automatically by the upstream
+        // loop-rolling prepass; this translator no longer needs to insert
+        // manual graph breaks at RMSNorm boundaries.
        let nodes = &self.parsed.program.graph_module.graph.nodes;
        for (i, node) in nodes.iter().enumerate() {
            self.translate_node(node)
@@ -336,3 +339,4 @@ impl<'a> Translator<'a> {
        None
    }
 }
+
--- a/crates/luminal_python/rust/src/translator/movement.rs
+++ b/crates/luminal_python/rust/src/translator/movement.rs
@@ -389,55 +389,100 @@ impl<'a> Translator<'a> {

    pub(crate) fn translate_index_put(&mut self, node: &Node) -> Result<GraphTensor> {
        let a = self.get_input_tensor(node, 0)?;
-        let index_names = node.inputs[1]
-            .arg
-            .as_tensors()
-            .context("index_put: indices not as_tensors")?;
        let values = self.get_input_tensor(node, 2)?;

-        if index_names.len() == 1 {
-            let idx_tensor = self.get_tensor(&index_names[0].name)?;
+        // --- all-tensor indices: bool-mask blend or scatter_nd ---
+        if let Some(index_names) = node.inputs[1].arg.as_tensors() {
+            if index_names.len() == 1 {
+                let idx_tensor = self.get_tensor(&index_names[0].name)?;

-            // Boolean-mask index_put: when the only index is a Bool tensor whose
-            // shape matches the data tensor, PyTorch semantics are
-            //   data[mask] = value   ↔   where(mask, value, data)
-            // NOT a scatter into positions. Casting the Bool mask to Int and
-            // feeding it to scatter_nd would reinterpret True/False as row
-            // indices 1/0 and silently corrupt the data. Reproducer:
-            //   x = arange(16).reshape(4, 4); mask = zeros(4, 4, dtype=bool)
-            //   y = x.clone(); y[mask] = 99   # eager: y == x (no-op)
-            // Pre-fix the compiled graph wrote 99 to row 0; this branch
-            // ensures the bool-mask path lowers to a where-blend instead.
-            if idx_tensor.dtype == DType::Bool && idx_tensor.shape.dims == a.shape.dims {
-                // Broadcast the (often scalar) value tensor to match data shape,
-                // then blend by mask. Cast mask to data's dtype for the arithmetic
-                // so this works for both integer and float data.
-                let mask_f = idx_tensor.cast(a.dtype);
-                let values_b = values.cast(a.dtype).expand_rhs(a.shape);
-                // Implements where(mask, value, a) as
-                //     a*(1 - mask) + value*mask
-                // — works without a dedicated cond op for any numeric dtype.
-                let one = self
-                    .graph
-                    .constant_float(1.0)
-                    .cast(a.dtype)
-                    .expand_rhs(a.shape);
-                return Ok(a * (one - mask_f) + values_b * mask_f);
+                // Boolean-mask index_put: when the only index is a Bool tensor whose
+                // shape matches the data tensor, PyTorch semantics are
+                //   data[mask] = value   ↔   where(mask, value, data)
+                // NOT a scatter into positions. Casting the Bool mask to Int and
+                // feeding it to scatter_nd would reinterpret True/False as row
+                // indices 1/0 and silently corrupt the data. Reproducer:
+                //   x = arange(16).reshape(4, 4); mask = zeros(4, 4, dtype=bool)
+                //   y = x.clone(); y[mask] = 99   # eager: y == x (no-op)
+                // Pre-fix the compiled graph wrote 99 to row 0; this branch
+                // ensures the bool-mask path lowers to a where-blend instead.
+                if idx_tensor.dtype == DType::Bool && idx_tensor.shape.dims == a.shape.dims {
+                    let mask_f = idx_tensor.cast(a.dtype);
+                    let values_b = values.cast(a.dtype).expand_rhs(a.shape);
+                    // Implements where(mask, value, a) as
+                    //     a*(1 - mask) + value*mask
+                    // — works without a dedicated cond op for any numeric dtype.
+                    let one = self
+                        .graph
+                        .constant_float(1.0)
+                        .cast(a.dtype)
+                        .expand_rhs(a.shape);
+                    return Ok(a * (one - mask_f) + values_b * mask_f);
+                }
+
+                // Integer-index scatter: index_put with indices=[idx_tensor] writes
+                // into dim 0 of `a` at every position named in idx_tensor (flattened),
+                // broadcasting values across the trailing dims of `a`. Always pad
+                // a trailing size-1 dim so rank-1 and rank-N cases share a path.
+                let indices = idx_tensor.cast(DType::Int);
+                let new_last = indices.shape.len();
+                let indices = indices.expand_dim(new_last, Expression::from(1usize));
+                return Ok(a.scatter_nd(indices, values));
+            }
+            bail!("index_put with multiple all-tensor indices not yet supported");
+        }
+
+        // --- optional-tensor indices: [None, arange_tensor, None, ...] ---
+        // Each None means "all of that dimension"; one tensor means "index into that dim".
+        // StaticCache uses this for KV updates: cache[:, :, position, :] = new_value.
+        if let Some(opt_tensors) = node.inputs[1].arg.as_optional_tensors() {
+            use crate::pt2_schema::OptionalTensorEntry;
+            let mut first_non_none_dim = 0usize;
+            let mut idx_name: Option<String> = None;
+            let mut non_none_count = 0usize;
+
+            for (i, entry) in opt_tensors.iter().enumerate() {
+                if let OptionalTensorEntry::Tensor(t) = entry {
+                    if idx_name.is_none() {
+                        first_non_none_dim = i;
+                    }
+                    idx_name = Some(t.as_tensor.name.clone());
+                    non_none_count += 1;
+                }
            }

-            // Integer-index scatter: index_put with indices=[idx_tensor] writes
-            // into dim 0 of `a` at every position named in idx_tensor (flattened),
-            // broadcasting values across the trailing dims of `a`. idx_tensor can
-            // be ANY shape — its whole shape is "batch dims" in scatter_nd terms,
-            // and K is always 1 (number of dims we're indexing into). Always pad
-            // a trailing size-1 dim so the rank-1 and rank-N cases share a path.
-            let indices = idx_tensor.cast(DType::Int);
-            let new_last = indices.shape.len();
-            let indices = indices.expand_dim(new_last, Expression::from(1usize));
-            Ok(a.scatter_nd(indices, values))
-        } else {
-            bail!("index_put with multiple index tensors not yet supported");
+            if non_none_count != 1 {
+                bail!(
+                    "index_put with optional tensors: only single non-None index supported \
+                     (got {non_none_count})"
+                );
+            }
+
+            let mut indices = self.get_tensor(&idx_name.unwrap())?.cast(DType::Int);
+
+            // Expand 1-D indices [P] to values.shape for scatter_elements:
+            // Build [1, ..., 1, P, 1, ..., 1] with P at first_non_none_dim, then broadcast.
+            let rank = a.shape.len();
+            // Insert singleton dims before first_non_none_dim
+            for i in 0..first_non_none_dim {
+                indices = indices.expand_dim(i, Expression::from(1usize));
+            }
+            // Insert singleton dims after first_non_none_dim
+            let current_rank = indices.shape.len();
+            for j in current_rank..rank {
+                indices = indices.expand_dim(j, Expression::from(1usize));
+            }
+            // Broadcast singletons to values shape
+            let values_shape: Vec<Expression> = values.shape.dims[..rank].to_vec();
+            indices.shape.expand(values_shape);
+
+            return Ok(a.scatter_elements(indices, values, first_non_none_dim));
        }
+
+        bail!(
+            "index_put: unsupported indices format: {:?}",
+            node.inputs[1].arg
+        )
    }

    pub(crate) fn translate_split_with_sizes(&mut self, node: &Node) -> Result<GraphTensor> {
--- a/crates/luminal_python/rust/src/translator/tensor.rs
+++ b/crates/luminal_python/rust/src/translator/tensor.rs
@@ -72,6 +72,73 @@ impl<'a> Translator<'a> {
        })
    }

+    /// Translate `aten.histc.default(input, bins, min, max)` → `Tensor[bins]`.
+    ///
+    /// Counts how many input elements fall in each of `bins` equal-width
+    /// buckets over `[min, max]`. PyTorch's histc accepts only 1D input;
+    /// HF MoE forwards emit it on flattened expert-assignment tensors to
+    /// produce per-expert token counts (one_hot + sum, essentially).
+    ///
+    /// Implementation: arange over bins, broadcast to [G, N], element-wise
+    /// `(lower <= input < upper)` into a F32 mask, sum over the input axis.
+    /// The right edge of the last bin is technically inclusive in PyTorch;
+    /// we treat it as exclusive — for the typical MoE use (integer expert
+    /// IDs in `[0, num_experts)`), no input ever equals `max` so this is
+    /// indistinguishable.
+    pub(crate) fn translate_histc(&mut self, node: &Node) -> Result<GraphTensor> {
+        let input = self.get_input_tensor(node, 0)?;
+        let bins = self.get_int_arg(node, 1)? as usize;
+        let min_val = self.get_float_arg(node, 2)? as f32;
+        let max_val = self.get_float_arg(node, 3)? as f32;
+
+        anyhow::ensure!(
+            input.shape.len() == 1,
+            "histc: only 1D input supported (got {}D)",
+            input.shape.len()
+        );
+        let n = input.shape.dims[0];
+        let g = Expression::from(bins);
+
+        let input_f = input.cast(DType::F32);
+        let step = (max_val - min_val) / bins as f32;
+
+        // Per-bin lower edges: arange(bins) * step + min.
+        let bin_idx = self.graph.arange(g).cast(DType::F32);
+        let lower_1d = bin_idx * step + min_val;
+        let upper_1d = lower_1d + step;
+
+        // Broadcast to [G, N] and produce the boolean mask.
+        let input_b = input_f.expand_dim(0, g);
+        let lower = lower_1d.expand_dim(1, n);
+        let upper = upper_1d.expand_dim(1, n);
+
+        let in_lower = input_b.ge(lower).cast(DType::F32);
+        let in_upper = input_b.lt(upper).cast(DType::F32);
+        let mask = in_lower * in_upper;
+
+        Ok(mask.sum(1))
+    }
+
+    /// Translate `aten.empty_permuted.default(size, physical_layout, **kwargs)`
+    /// → zero-filled tensor of shape `size`.
+    ///
+    /// PyTorch's `empty_permuted` allocates uninitialized memory with a given
+    /// stride permutation; downstream code typically overwrites every element
+    /// before reading. Luminal's tensor abstraction doesn't expose strides, so
+    /// the physical_layout hint is irrelevant — we just emit a zero tensor of
+    /// the requested shape and dtype. The dispatcher also routes
+    /// `aten.empty.memory_format` here, since the same approach applies.
+    pub(crate) fn translate_empty(&mut self, node: &Node) -> Result<GraphTensor> {
+        let shape = self.get_exprs_arg(node, 0)?;
+        let dtype = self.output_meta_dtype(node)?;
+        let value = self.graph.constant_float(0.0).cast(dtype);
+        Ok(if shape.is_empty() {
+            value
+        } else {
+            value.expand_rhs(shape)
+        })
+    }
+
    pub(crate) fn translate_full_like(&mut self, node: &Node) -> Result<GraphTensor> {
        let reference = self.get_input_tensor(node, FULL_LIKE_INPUT_ARG)?;
        let val = if let Ok(f) = self.get_float_arg(node, FULL_LIKE_VALUE_ARG) {
@@ -109,13 +176,18 @@ impl<'a> Translator<'a> {
    /// Output `[S, N]` where token m (in group g s.t. `offs[g-1] <= m < offs[g]`)
    /// is multiplied by `weight[g]`.
    ///
-    /// Implementation:
-    ///   1. Batched matmul across every expert: `[G, S, K] @ [G, K, N] → [G, S, N]`
-    ///      (input broadcast along the G batch dim — matches luminal's 3D@3D pattern
-    ///      so the CUDA optimizer can fuse it into a batched GEMM).
-    ///   2. Build a `[G, S]` group-membership mask from `offs`:
-    ///      `expert_id[m] = Σ_g (offs[g] <= m)`, then `mask[g, m] = (g == expert_id[m])`.
-    ///   3. Multiply `[G, S, N]` result by the broadcast mask and sum over `G`.
+    /// Implementation: for each token m we (a) compute its expert id from offs,
+    /// (b) gather only that expert's `[K, N]` slice from weight, and (c) do a
+    /// single per-token matmul. The gather pattern mirrors the rust qwen3_moe
+    /// example's `gather_experts`, which the GLUMoE host-op fusion in
+    /// `luminal_cuda_lite` is designed to recognise.
+    ///
+    /// Why not the straightforward `[G, S, K] @ [G, K, N] → [G, S, N]` + mask:
+    /// it forces a full F32 cast of the entire `[G, K, N]` weight tensor as
+    /// search-time intermediate, which OOMs on real MoE checkpoints
+    /// (Qwen3-30B-A3B: 1.5 GB / layer × 48 layers for gate-up alone). Gathering
+    /// first keeps the F32 cast on `[S, K, N]` instead — for prefill (S = top_k)
+    /// that is a 16× shrink (G=128, top_k=8).
    ///
    /// `offs` flows through as a runtime tensor — the routing decision is computed
    /// at execution time by the gate network and the same compiled graph handles
@@ -143,33 +215,54 @@ impl<'a> Translator<'a> {

        let s = input.shape.dims[0];
        let g = weight.shape.dims[0];
+        let k = weight.shape.dims[1];
        let n = weight.shape.dims[2];

-        let input_f = input.cast(DType::F32);
-        let weight_f = weight.cast(DType::F32);
+        // expert_id[m] = number of g s.t. m >= offs[g]
+        //              = first g s.t. m < offs[g], i.e. the expert assigned to m.
+        // Clamp to [0, G-1] before using as gather index. Matches HF MoE's
+        // `expert_ids.clamp(0, num_experts-1)` for invalid IDs from EP, AND
+        // protects search-time profiling: dummy-1 input bytes give offs=[1,…,1],
+        // which makes `m >= offs[g]` true for m≥1 and pushes expert_id to G,
+        // out of bounds for the weight gather. Clamping keeps the gather safe.
+        let g_max_f = (g
+            .to_usize()
+            .context("_grouped_mm: G (num_experts) must be concrete")?
+            as f32)
+            - 1.0;
        let offs_f = offs.cast(DType::F32);
-
-        // Batched matmul over every expert: [G, S, K] @ [G, K, N] → [G, S, N].
-        let input_batched = input_f.expand_dim(0, g);
-        let all_out = input_batched.matmul(weight_f);
-
-        // Group mask [G, S].
-        let s_arange = self.graph.arange(s).cast(DType::F32);
-        let g_arange = self.graph.arange(g).cast(DType::F32);
-        let ge_boundary = s_arange
+        let s_arange_f = self.graph.arange(s).cast(DType::F32);
+        let ge_boundary = s_arange_f
            .expand_dim(0, g)
            .ge(offs_f.expand_dim(1, s))
            .cast(DType::F32);
-        let expert_id = ge_boundary.sum(0);
-        let mask = g_arange
-            .expand_dim(1, s)
-            .eq(expert_id.expand_dim(0, g))
-            .cast(DType::F32);
+        let expert_id = ge_boundary
+            .sum(0)
+            .minimum_f32(g_max_f)
+            .cast(DType::Int); // [S] Int

-        // Apply mask and sum over experts.
-        let out = (all_out * mask.expand_dim(2, n)).sum(0);
+        // Flat gather index into weight (treated as a length-G*K*N 1D buffer):
+        //   flat[m, k_, n_] = expert_id[m] * (K*N) + k_ * N + n_
+        // Encoded as `Mul(expert_id, Iota(io_const)) + Iota(MIter, K*N)` so the
+        // resulting Gather matches the GLUMoE / gather-experts egglog patterns.
+        let io = k * n;
+        let base = expert_id * io;
+        let within = self.graph.iota(Expression::from('z'), (k, n));
+        let exp_base = base.expand_dim(1, k).expand_dim(2, n);
+        let exp_within = within.expand_dim(0, s);
+        let flat_idx = exp_base + exp_within;

-        Ok(out.cast(input.dtype))
+        // Gather → [S, K, N]. Preserves weight's native dtype (bf16 stays bf16).
+        let weight_gathered = weight.gather(flat_idx);
+
+        // Cast for matmul — now on the small gathered slice, not the full weight.
+        let input_f = input.cast(DType::F32);
+        let weight_f = weight_gathered.cast(DType::F32);
+
+        // Per-token matmul: [S, 1, K] @ [S, K, N] → [S, 1, N] → [S, N].
+        let result = input_f.unsqueeze(1).matmul(weight_f).squeeze(1);
+
+        Ok(result.cast(input.dtype))
    }

    pub(crate) fn translate_where(&mut self, node: &Node) -> Result<GraphTensor> {
--- a/crates/luminal_python/rust/src/translator/unary.rs
+++ b/crates/luminal_python/rust/src/translator/unary.rs
@@ -257,13 +257,54 @@ impl<'a> Translator<'a> {
            None
        };

-        let mut result = a;
+        // maximum_f32 / minimum_f32 internally use `.lt(F32 scalar)`, which
+        // asserts matching tensor dtypes. Without this, clamp on an Int tensor
+        // (e.g. Qwen3-MoE routes `cache_position.clamp(...)` through here)
+        // panics inside luminal core. Promote to F32 around the bounds check
+        // and cast back at the end.
+        let original_dtype = a.dtype;
+        let needs_promote = original_dtype != DType::F32;
+        let mut result = if needs_promote { a.cast(DType::F32) } else { a };
        if let Some(min) = min_val {
            result = result.maximum_f32(min);
        }
        if let Some(max) = max_val {
            result = result.minimum_f32(max);
        }
+        if needs_promote {
+            result = result.cast(original_dtype);
+        }
        Ok(result)
    }
+
+    /// Compute `erf(a)` via the Abramowitz & Stegun 7.1.26 approximation
+    /// (max error ~1.5e-7). Shared by `aten.erf.default` and the exact
+    /// `aten.gelu.default` (which is `0.5 * x * (1 + erf(x / sqrt(2)))`).
+    ///
+    /// erf(x) = sign(x) * (1 - poly(t) * exp(-x^2))
+    /// where t = 1/(1 + 0.3275911*|x|), poly is degree 5 in Horner form.
+    ///
+    /// Promotes the input to F32 internally (the approximation constants are
+    /// F32 anyway, and luminal's binary ops assert matching dtypes — running
+    /// this on Bf16 input directly trips the assertion at `a.ge(zero)`).
+    /// Restores the original dtype on return.
+    pub(crate) fn erf_approx(&mut self, a: GraphTensor) -> GraphTensor {
+        let orig = a.dtype;
+        // Work in F32; the cast is skipped when the input already is F32.
+        let a = if orig == DType::F32 { a } else { a.cast(DType::F32) };
+        let ax = a.abs();
+        let x2 = a * a;
+        // t = 1 / (1 + p*|x|) with p = 0.3275911.
+        let t = (ax * 0.3275911_f32 + 1.0).reciprocal();
+        // Horner evaluation of a1*t + a2*t^2 + a3*t^3 + a4*t^4 + a5*t^5;
+        // the innermost constant is a5, the outermost is a1.
+        let poly = t
+            * (t * (t
+                * (t * (t * 1.061_405_4_f32 + (-1.453_152_1_f32)) + 1.421_413_8_f32)
+                + (-0.284_496_72_f32))
+                + 0.254_829_6_f32);
+        // Magnitude of the result: 1 - poly(t) * exp(-x^2), built from |x| and x^2
+        // so it is the same for x and -x.
+        let result_abs =
+            self.graph.constant_float(1.0).expand_rhs(a.shape) - poly * (x2 * (-1.0)).exp();
+        // sign(x) = 2*(x >= 0) - 1  (x == 0 therefore maps to +1, not 0)
+        let zero = self.graph.constant_float(0.0).expand_rhs(a.shape);
+        let sign = a.ge(zero).cast(DType::F32) * 2.0 - 1.0;
+        let result = result_abs * sign;
+        // Hand the caller back the dtype it passed in.
+        if orig == DType::F32 { result } else { result.cast(orig) }
+    }
 }
--- a/crates/luminal_python/src/luminal/init.py
+++ b/crates/luminal_python/src/luminal/init.py
@@ -2,6 +2,8 @@

 # Import Python components
 # Register DynamicCache pytree serialization once at import time
+import torch.export._unlift as _torch_export_unlift
+
 from .cache_utils import _register_cache_serialization
 from .compiled_model import CompiledModel

@@ -11,6 +13,49 @@ from .main import luminal_backend, register_backend

 _register_cache_serialization()

+# ---------------------------------------------------------------------------
+# Suppress torch.export's `_guards_fn` insertion when luminal is on the stack.
+#
+# When `torch._dynamo.config.automatic_dynamic_shapes=True` (the default) and
+# a model is called with shapes that vary across calls, dynamo promotes the
+# changing dim to a SymInt and re-traces. During the re-trace, torch.export's
+# `_unlift_exported_program_lifted_states` (in `torch/export/_unlift.py`)
+# generates a `_guards_fn` submodule whose body closes over `L` — dynamo's
+# locals namespace. When aot_autograd later evaluates the resulting
+# GraphModule via fx.Interpreter, the closure's free `L` reference doesn't
+# resolve and we get
+#       NameError: name 'L' is not defined
+# (gemma3 + StaticCache reproduces this deterministically).
+#
+# torch.export's own opt-out — `_ok_to_generate_guards_fn` — already walks
+# the call stack for filename patterns to suppress guard generation for
+# specific embedders (executorch, modai, on_device_ai, torchao). Add
+# "luminal" to the same suppression set by monkey-patching the function.
+# Net effect: torch.export never inserts `_guards_fn`, so re-tracing
+# succeeds, dynamic-shape compile-once-run-many works, and StaticCache
+# decode loops compile in ~one shot instead of per-token.
+# ---------------------------------------------------------------------------
+
+_orig_ok_to_generate_guards_fn = _torch_export_unlift._ok_to_generate_guards_fn
+
+
+def _luminal_aware_ok_to_generate_guards_fn() -> bool:
+    """Return False whenever luminal is anywhere in the call stack."""
+    import inspect
+
+    frame = inspect.currentframe()
+    try:
+        while frame is not None:
+            if "luminal" in frame.f_code.co_filename:
+                return False
+            frame = frame.f_back
+    finally:
+        del frame  # avoid reference cycle
+    return _orig_ok_to_generate_guards_fn()
+
+
+_torch_export_unlift._ok_to_generate_guards_fn = _luminal_aware_ok_to_generate_guards_fn
+
 # Re-export everything for clean package interface
 __all__ = [
    "CompiledModel",
--- a/crates/luminal_python/src/luminal/main.py
+++ b/crates/luminal_python/src/luminal/main.py
@@ -10,8 +10,17 @@ from .dtype_util import torch_dtype_code as _torch_dtype_code


 def _detect_factory_capsule(example_inputs):
-    """Pick the best built-in factory capsule based on input device."""
-    device = example_inputs[0].device if example_inputs else torch.device("cpu")
+    """Pick the best built-in factory capsule based on input device.
+
+    Walks example_inputs for the first Tensor to read .device from. With
+    dynamic=True, dynamo may pass SymInt/SymFloat alongside Tensors and those
+    don't have a .device attribute — falling back to CPU on a SymInt-only call
+    would silently route to the wrong backend, so prefer the first Tensor."""
+    device = torch.device("cpu")
+    for v in example_inputs or ():
+        if isinstance(v, torch.Tensor):
+            device = v.device
+            break
    if device.type == "cuda":
        try:
            from .luminal import _cuda_lite_factory_capsule
@@ -76,7 +85,7 @@ def register_backend(factory_capsule):
    """

    def backend(gm, example_inputs, options=None):
-        return _compile_pt2(gm, example_inputs, factory_capsule)
+        return _compile_pt2(gm, example_inputs, factory_capsule, options=options)

    return backend

@@ -95,7 +104,7 @@ def luminal_backend(gm, example_inputs, options=None):
    For external backends, use register_backend with the backend's factory capsule.
    """
    capsule = _detect_factory_capsule(example_inputs)
-    return _compile_pt2(gm, example_inputs, capsule)
+    return _compile_pt2(gm, example_inputs, capsule, options=options)


 # ---------------------------------------------------------------------------
@@ -103,8 +112,16 @@ def luminal_backend(gm, example_inputs, options=None):
 # ---------------------------------------------------------------------------


-def _compile_pt2(gm, example_inputs, factory_capsule):
+def _compile_pt2(gm, example_inputs, factory_capsule, options=None):
    """PT2/torch.export path — delegates to pt2.pt2_backend."""
    from .pt2 import pt2_backend

-    return pt2_backend(gm, example_inputs, factory=factory_capsule)
+    # Only the "search_iterations" key is read from `options`; other keys are
+    # ignored here. A missing key or a missing `options` yields None, which is
+    # forwarded unchanged to pt2_backend.
+    search_iterations = None
+    if options is not None:
+        search_iterations = options.get("search_iterations")
+    return pt2_backend(
+        gm,
+        example_inputs,
+        factory=factory_capsule,
+        search_iterations=search_iterations,
+    )
--- a/crates/luminal_python/src/luminal/pt2.py
+++ b/crates/luminal_python/src/luminal/pt2.py
@@ -110,6 +110,44 @@ def _export_kwargs():
    return kwargs


+def _extract_pt2_constants(pt2_path):
+    """Extract tensor constants from the new flat PT2 format (torch >= 2.6).
+
+    In the new format, inline constants (e.g. ``torch.tensor([1., 2.])``) are
+    stored in ``serialized_constants.pt`` rather than individual ZIP entries.
+    The Rust parser skips them (returns constants_config=None); this function
+    reads them back and returns a cpu_ptrs dict ready for _load_cpu_weights.
+
+    Returns (keep_alive, cpu_ptrs) — keep_alive must stay alive until after
+    _load_cpu_weights returns (set_weight_from_ptr copies the bytes).
+    """
+    import io
+    import zipfile
+
+    from .dtype_util import torch_dtype_code as _torch_dtype_code
+
+    # Best-effort read: a missing entry, an unreadable archive, or any other
+    # failure while opening it is reported as "no constants" rather than raised.
+    try:
+        with zipfile.ZipFile(pt2_path) as z:
+            if "serialized_constants.pt" not in z.namelist():
+                return [], {}
+            data = z.read("serialized_constants.pt")
+    except Exception:
+        return [], {}
+
+    # NOTE(review): weights_only=False performs a full unpickle. Presumably
+    # pt2_path is always an archive this process produced — confirm before
+    # letting externally supplied .pt2 files reach this call.
+    constants = torch.load(io.BytesIO(data), weights_only=False)
+    if not constants:
+        return [], {}
+
+    keep_alive = []
+    cpu_ptrs = {}
+    for name, tensor in constants.items():
+        # Contiguous CPU copy: data_ptr() plus numel * element_size then
+        # describes the whole buffer. keep_alive holds the copy so the raw
+        # pointer stays valid for the caller.
+        t = tensor.detach().cpu().contiguous()
+        keep_alive.append(t)
+        n_bytes = t.numel() * t.element_size()
+        # Entry layout: (address, byte length, dtype code).
+        cpu_ptrs[name] = (t.data_ptr(), n_bytes, _torch_dtype_code(t.dtype))
+    return keep_alive, cpu_ptrs
+
+
 def _save_and_compile(ep_or_path, factory, search_iterations, original_weights=None):
    """Compile a PT2 model via Rust, return CompiledModel.

@@ -145,8 +183,12 @@ def _save_and_compile(ep_or_path, factory, search_iterations, original_weights=N
            pt2_path, "", search_iterations, factory, weight_device_ptrs
        )

-        # Load CPU weights after compilation
+        # Load CPU weights; also load inline tensor constants from the new flat
+        # PT2 format (torch >= 2.6 stores them in serialized_constants.pt).
+        const_keep_alive, const_cpu_weights = _extract_pt2_constants(pt2_path)
+        cpu_weights.update(const_cpu_weights)
        _load_cpu_weights(compiled, cpu_weights)
+        del const_keep_alive  # bytes were copied by set_weight_from_ptr

        return CompiledModel(compiled, weight_refs=keep_alive)
    finally:
@@ -158,13 +200,21 @@ def _reinternalize_lifted_params(gm, example_inputs):
    """Re-internalize lifted params as buffers so torch.export sees them as model state.

    torch.compile lifts model parameters out of the module and passes them as
-    extra elements in example_inputs.  The Rust PT2 compiler may expect weights in
-    the .pt2 state dict, not as runtime inputs.  This function reverses the
+    extra elements in example_inputs. The Rust PT2 compiler may expect weights in
+    the .pt2 state dict, not as runtime inputs. This function reverses the
    lifting by registering them as buffers and replacing the placeholder nodes
    with get_attr nodes.

+    SymInt/SymFloat/SymBool values in example_inputs are rejected by
+    torch.export.export as user inputs ("Unsupported input type
+    <class 'torch.SymInt'>"). We don't restructure the graph for this — we
+    specialize the *value* to its concrete hint (a plain int/float/bool), which
+    torch.export accepts. The placeholder stays in place; the traced graph
+    proceeds as if dynamo had specialized this dim. Invisible to callers of
+    `torch.compile(..., backend=luminal_backend)`.
+
    Returns (gm, user_inputs, original_weights) where:
-      - user_inputs contains only the real inputs
+      - user_inputs contains only real inputs (Tensors and concrete scalars)
      - original_weights maps buffer name -> original tensor (for zero-copy device pointers)
    """
    buffer_indices = []
@@ -198,14 +248,49 @@ def _reinternalize_lifted_params(gm, example_inputs):
        gm.graph.lint()
        gm.recompile()

-    user_inputs = (
+    raw_user_inputs = (
        [example_inputs[i] for i in user_indices]
        if user_indices
        else list(example_inputs)
    )
+    user_inputs = [
+        _specialize_sym_scalar(v) if _is_sym_scalar(v) else v
+        for v in raw_user_inputs
+    ]
    return gm, user_inputs, original_weights


+def _is_sym_scalar(val) -> bool:
+    """True for torch SymInt/SymFloat/SymBool — anything torch.export's fakify
+    rejects as a user input. Plain int/float/bool are fine; only the symbolic
+    wrappers need specialization."""
+    # None and Tensors are ruled out up front, before any type inspection.
+    if val is None:
+        return False
+    if isinstance(val, torch.Tensor):
+        return False
+    # Two checks are OR-ed: the class *name*, then a regular isinstance test.
+    # The name comparison also accepts any class that merely shares one of
+    # these names. NOTE(review): presumably a guard for wrappers that do not
+    # subclass the torch types — confirm it is still needed.
+    return type(val).__name__ in ("SymInt", "SymFloat", "SymBool") or isinstance(
+        val, (torch.SymInt, torch.SymFloat, torch.SymBool)
+    )
+
+
+def _specialize_sym_scalar(val):
+    """Resolve a SymInt/SymFloat/SymBool to its concrete hint. Falls back to
+    str(val) -> primitive parse if the SymNode hint is missing."""
+    try:
+        if isinstance(val, torch.SymBool):
+            return bool(val)
+        if isinstance(val, torch.SymFloat):
+            return float(val)
+        return int(val)
+    except Exception:
+        # SymNodes without a hint — try parsing the str form as a last resort.
+        s = str(val)
+        try:
+            return int(s)
+        except ValueError:
+            return float(s)
+
+
 # ---------------------------------------------------------------------------
 # Public API
 # ---------------------------------------------------------------------------
@@ -288,7 +373,7 @@ def compile(
    return _save_and_compile(ep, factory, search_iterations)


-def pt2_backend(gm, example_inputs, factory=None):
+def pt2_backend(gm, example_inputs, factory=None, search_iterations=None):
    """torch.compile backend using PT2 pipeline.

    Usage: torch.compile(model, backend=luminal.register_backend(capsule))
@@ -297,6 +382,8 @@ def pt2_backend(gm, example_inputs, factory=None):

    if factory is None:
        factory = _detect_factory_capsule(example_inputs)
+    if search_iterations is None:
+        search_iterations = 10

    gm = gm.eval()
    gm, user_inputs, original_weights = _reinternalize_lifted_params(gm, example_inputs)
@@ -304,6 +391,28 @@ def pt2_backend(gm, example_inputs, factory=None):
    ep = torch.export.export(gm, tuple(user_inputs), **_export_kwargs())
    ep = ep.run_decompositions()

+    # Detect USER_INPUT_MUTATION outputs (e.g., in-place KV cache updates).
+    # These must be written back to the original input tensors after each call.
+    # Only USER_OUTPUT results are returned to the torch.compile caller.
+    try:
+        from torch.export.graph_signature import OutputKind
+
+        mutation_mappings = []  # list of (compiled_output_idx, user_input_idx)
+        user_output_indices = []
+        for i, spec in enumerate(ep.graph_signature.output_specs):
+            if spec.kind == OutputKind.USER_INPUT_MUTATION:
+                # target is 'args_N' — index into user_inputs
+                try:
+                    arg_idx = int(spec.target.split("_")[1])
+                    mutation_mappings.append((i, arg_idx))
+                except (ValueError, IndexError):
+                    user_output_indices.append(i)
+            else:
+                user_output_indices.append(i)
+    except ImportError:
+        mutation_mappings = []
+        user_output_indices = None  # unknown; return all outputs
+
    # When using shared memory (original_weights), strip large weight buffers from
    # the EP before saving.  The Rust side uses device pointers for these weights,
    # not the .pt2 file data, so serializing them is pure IO waste (~32 GB for 8B
@@ -329,8 +438,27 @@ def pt2_backend(gm, example_inputs, factory=None):

    try:
        result = _save_and_compile(
-            pt2_path, factory, 10, original_weights=original_weights
+            pt2_path, factory, search_iterations, original_weights=original_weights
        )
-        return result
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)
+
+    # Wrap the compiled model to handle USER_INPUT_MUTATION: write updated tensors
+    # back into the original input buffers and return only USER_OUTPUT tensors.
+    if mutation_mappings:
+        _compiled = result
+        _mut = mutation_mappings
+        _usr = user_output_indices
+
+        def _mutation_wrapper(*inputs):
+            outputs = _compiled(*inputs)
+            for out_idx, inp_idx in _mut:
+                if inp_idx < len(inputs) and out_idx < len(outputs):
+                    inputs[inp_idx].copy_(outputs[out_idx])
+            if _usr is not None:
+                return tuple(outputs[i] for i in _usr if i < len(outputs))
+            return outputs
+
+        return _mutation_wrapper
+
+    return result
--- a/crates/luminal_python/tests/test_hlir_ops.py
+++ b/crates/luminal_python/tests/test_hlir_ops.py
@@ -170,7 +170,8 @@ from test_models import (
    ScatterElementsAxis0TestModel,
    # ScatterElements models
    ScatterElementsTestModel,
-    # ScatterND model
+    # ScatterND / IndexPut models
+    IndexPutOptionalModel,
    ScatterNDTestModel,
    ShapeReshapeBatchFlattenModel,
    ShapeReshapeKeepBatchModel,
@@ -2081,6 +2082,16 @@ def test_scatter_nd(device: torch.device):
    assert torch.allclose(output, original)


+def test_index_put_optional(device: torch.device):
+    """Tests index_put with optional (None) indices — mirrors StaticCache KV update."""
+    model: torch.nn.Module = IndexPutOptionalModel().to(device)
+    model_compiled: Callable = torch.compile(model, backend=luminal_backend)
+    # All-zero input, so the ones written by the model stand out in the output.
+    x: torch.Tensor = torch.zeros(2, 2, 8, 4, device=device)
+    # The eager result is the reference the compiled result is compared against.
+    original: torch.Tensor = model(x)
+    output: torch.Tensor = model_compiled(x)
+    assert torch.allclose(output, original, atol=1e-5)
+
+
 # ========== Bool-mask index_put correctness tests ==========
 #
 # `x[bool_mask] = scalar` is semantically `where(mask, scalar, x)`, NOT a
--- a/crates/luminal_python/tests/test_llama3.py
+++ b/crates/luminal_python/tests/test_llama3.py
@@ -414,6 +414,71 @@ def test_dynamic_dim_reuse_no_recompile(device: torch.device):
        )


+def test_hf_llama3_8b_instruct_1layer(device: torch.device):
+    """HuggingFace LlamaForCausalLM — Llama-3-8B-Instruct architecture, 1 layer, random weights.
+
+    Uses the exact model architecture from the TTFT benchmark
+    (NousResearch/Meta-Llama-3-8B-Instruct) with num_hidden_layers=1. Full 8B width:
+    4096 hidden, 32 attn heads, 8 KV heads, 14336 intermediate, 128256 vocab.
+    Random weights — tests that compilation and execution complete without error.
+
+    Regression for: NativeRuntime panic 'no entry found for key' (hlir.rs:2239) when the
+    wheel is built without --features cuda. The CUDA factory capsule silently falls back
+    to NativeRuntime, which cannot process GPU-resident weight device pointers, leaving
+    Output-node predecessor buffers unpopulated.
+    """
+    from transformers import AutoConfig, LlamaForCausalLM
+
+    # Only the config is fetched by name; the model below is constructed from
+    # it directly rather than loaded with from_pretrained.
+    config = AutoConfig.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct")
+    # Overrides: a single decoder layer, no KV cache, eager attention.
+    config.num_hidden_layers = 1
+    config.use_cache = False
+    config._attn_implementation = "eager"
+
+    model = LlamaForCausalLM(config).eval().to(device)
+    compiled = torch.compile(model, backend=luminal_backend)
+    input_ids = torch.tensor([[1, 2, 3, 4]], device=device)
+    with torch.no_grad():
+        # Eager logits are the reference for the compiled logits.
+        ref = model(input_ids)
+        out = compiled(input_ids)
+    assert torch.allclose(out.logits, ref.logits, atol=1e-4), (
+        f"max_diff={torch.max(torch.abs(out.logits - ref.logits)).item():.2e}"
+    )
+
+
+@pytest.mark.slow
+@pytest.mark.xfail(reason="numerical precision — max_diff exceeds atol at full 8B scale")
+def test_hf_llama3_8b_instruct_full(device: torch.device):
+    """HuggingFace LlamaForCausalLM — full Llama-3-8B-Instruct with real pretrained weights.
+
+    Direct reproduction of the TTFT benchmark scenario. All 32 layers at full width.
+    Loads actual weights from NousResearch/Meta-Llama-3-8B-Instruct (~30 GB in fp32).
+    Marked slow (requires model download) and xfail (numerical precision at this scale).
+    """
+    from transformers import AutoConfig, LlamaForCausalLM
+
+    config = AutoConfig.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct")
+    # Overrides: no KV cache, eager attention. The layer count is left as published.
+    config.use_cache = False
+    config._attn_implementation = "eager"
+
+    # Pretrained weights, loaded as float32 and moved to the test device.
+    model = (
+        LlamaForCausalLM.from_pretrained(
+            "NousResearch/Meta-Llama-3-8B-Instruct",
+            config=config,
+            torch_dtype=torch.float32,
+        )
+        .eval()
+        .to(device)
+    )
+    compiled = torch.compile(model, backend=luminal_backend)
+    input_ids = torch.tensor([[1, 2, 3, 4]], device=device)
+    with torch.no_grad():
+        # Eager logits are the reference for the compiled logits.
+        ref = model(input_ids)
+        out = compiled(input_ids)
+    assert torch.allclose(out.logits, ref.logits, atol=1e-5), (
+        f"max_diff={torch.max(torch.abs(out.logits - ref.logits)).item():.2e}"
+    )
+
+
@pytest.mark.xfail(reason="numerical precision — max_diff exceeds atol")
 def test_hf_llama38b_full(device: torch.device):
    """HuggingFace LlamaForCausalLM — full Llama-3.1-8B-Instruct with real pretrained weights.
--- a/crates/luminal_python/tests/test_models.py
+++ b/crates/luminal_python/tests/test_models.py
@@ -1752,6 +1752,22 @@ class ScatterNDTestModel(torch.nn.Module):
        return result


+class IndexPutOptionalModel(torch.nn.Module):
+    """Tests index_put with optional (None) indices — mirrors StaticCache KV update.
+
+    result[:, :, pos, :] = ones  →  index_put([None, None, pos_tensor, (implied None)], ones)
+    Input: (2, 2, 8, 4)  Output: same shape with dim-2 position 0 set to 1.
+    Batch size > 1 is required so PT2 preserves the full rank of the values tensor.
+    """
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # One-element long tensor holding index 0; used as a tensor index on dim 2.
+        pos = torch.zeros(1, dtype=torch.long, device=x.device)
+        # Values to write: shape (2, 2, 1, 4), all ones.
+        v = torch.ones(2, 2, 1, 4, device=x.device)
+        # Write into a clone so the caller's input tensor is left unmodified.
+        result = x.clone()
+        result[:, :, pos, :] = v
+        return result
+
+
 # ========== Llama3 Component Test Models ==========


--- a/examples/gemma/src/main.rs
+++ b/examples/gemma/src/main.rs
@@ -13,11 +13,21 @@ use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};

 const REPO_ID: &str = "unsloth/gemma-3-4b-it";

+// Default configuration — override at runtime via env vars.
+const DEFAULT_MAX_SEQ_LEN:   usize = 4096;
+const DEFAULT_SEARCH_GRAPHS: usize = 50;
+const DEFAULT_GEN_TOKENS:    usize = 500;
+
+/// Read a `usize` override from the environment variable `name`.
+///
+/// Returns `default` when the variable is unset or not valid Unicode. When it
+/// is set but does not parse as a `usize`, a warning is written to stderr and
+/// `default` is returned, so a mistyped override is not applied silently.
+fn env_usize(name: &str, default: usize) -> usize {
+    match std::env::var(name) {
+        Ok(raw) => raw.parse().unwrap_or_else(|_| {
+            eprintln!(
+                "warning: ignoring {}={:?} (not a valid usize); using default {}",
+                name, raw, default
+            );
+            default
+        }),
+        Err(_) => default,
+    }
+}
+
 fn main() {
-    let max_seq_len = 4096;
-    let gen_tokens = 500;
-    let search_graphs = 500;
-    let prompt = "Explain what a neural network is in simple terms:";
+    let max_seq_len    = env_usize("MAX_SEQ_LEN",   DEFAULT_MAX_SEQ_LEN);
+    let gen_tokens     = env_usize("GEN_TOKENS",    DEFAULT_GEN_TOKENS);
+    let search_graphs  = env_usize("SEARCH_GRAPHS", DEFAULT_SEARCH_GRAPHS);
+    let prompt = std::env::var("PROMPT")
+        .unwrap_or_else(|_| "Explain what a neural network is in simple terms:".to_string());

    tracing_subscriber::registry()
        .with(tracing_subscriber::fmt::layer())
@@ -46,6 +56,7 @@ fn main() {
    }

    println!("Building E-Graph...");
+    let compile_start = std::time::Instant::now();
    cx.build_search_space::<CudaRuntime>();

    println!("Loading weights...");
@@ -65,36 +76,90 @@ fn main() {
    runtime.set_data(input, vec![1]);
    runtime.set_data(token_ids, vec![1]);
    runtime = cx.search(runtime, search_graphs);
+    println!("  COMPILE: {:.2} ms", compile_start.elapsed().as_secs_f64() * 1e3);

    for i in 0..LAYERS {
        runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
        runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
    }

+    // Full-prompt warmup: run the complete prompt to bring the GPU to steady state before timing
+    for (w_step, &w_token) in prompt_tokens.iter().enumerate() {
+        let p = w_step + 1;
+        cx.set_dim('s', 1);
+        cx.set_dim('p', p);
+        runtime.set_data(input, vec![w_token as i32]);
+        runtime.set_data(token_ids, vec![p as i32]);
+        runtime.execute(&cx.dyn_map);
+        for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
+            let k_buf = runtime.remove_buffer(*k_out);
+            let v_buf = runtime.remove_buffer(*v_out);
+            runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
+            runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
+        }
+    }
+    for i in 0..LAYERS {
+        runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
+        runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
+    }
+
+    let iters       = env_usize("ITERS", 3);
+    let prompt_len  = prompt_tokens.len();
+
+    println!("Prompt: {} tokens, generating up to {} tokens", prompt_len, gen_tokens);
+
+    // ── TTFT: prefill-only timing over N iterations ───────────────────────
+    let mut ttft_samples_ms: Vec<f64> = vec![];
+    for _ in 0..iters {
+        for i in 0..LAYERS {
+            runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
+            runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
+        }
+        let mut prev_seq = 1usize;
+        let mut step_times = vec![];
+        for step in 0..prompt_len {
+            cx.set_dim('s', 1);
+            cx.set_dim('p', prev_seq);
+            runtime.set_data(input, vec![prompt_tokens[step] as i32]);
+            runtime.set_data(token_ids, vec![prev_seq as i32]);
+            let t = std::time::Instant::now();
+            runtime.execute(&cx.dyn_map);
+            let _ = runtime.get_f32(logits);
+            step_times.push(t.elapsed());
+            for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
+                let k_buf = runtime.remove_buffer(*k_out);
+                let v_buf = runtime.remove_buffer(*v_out);
+                runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
+                runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
+            }
+            prev_seq += 1;
+        }
+        ttft_samples_ms.push(step_times.iter().sum::<Duration>().as_secs_f64() * 1e3);
+    }
+    ttft_samples_ms.sort_by(|a, b| a.partial_cmp(b).unwrap());
+    let ttft_ms = ttft_samples_ms[ttft_samples_ms.len() / 2];
+
+    // ── Text generation: one pass for TPOT + visible output ───────────────
+    for i in 0..LAYERS {
+        runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
+        runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
+    }
    let mut prev_seq = 1usize;
    let mut sentence = vec![prompt_tokens[0]];
-    let total_steps = prompt_tokens.len() - 1 + gen_tokens;
-    let prompt_len = prompt_tokens.len();
-    let mut fwd_durations = vec![];
+    let total_steps = prompt_len - 1 + gen_tokens;
+    let mut decode_step_times: Vec<Duration> = vec![];
    let mut seen_tokens = FxHashSet::default();
    let repetition_penalty: f32 = 1.05;

    const EOS_TOKEN: u32 = 1;
    const STOP_TOKEN: u32 = 107;

-    println!(
-        "Prompt: {} tokens, generating up to {} tokens",
-        prompt_len, gen_tokens
-    );
-
    for i in 0..total_steps {
-        let start = std::time::Instant::now();
        let is_prefill = i < prompt_len - 1;
        let seq_len = sentence.len();

        cx.set_dim('s', seq_len);
        cx.set_dim('p', prev_seq);
-
        runtime.set_data(
            input,
            sentence.iter().map(|t| *t as i32).collect::<Vec<_>>(),
@@ -104,26 +169,26 @@ fn main() {
            (prev_seq as i32..(seq_len + prev_seq) as i32).collect::<Vec<_>>(),
        );

+        let step_start = std::time::Instant::now();
        runtime.execute(&cx.dyn_map);
        let logits_data = runtime.get_f32(logits);
+        let step_elapsed = step_start.elapsed();

-        // Round-trip KV cache
        for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
            let k_buf = runtime.remove_buffer(*k_out);
            let v_buf = runtime.remove_buffer(*v_out);
            runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
            runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
        }
-
        prev_seq += seq_len;
-        fwd_durations.push(start.elapsed());

        if is_prefill {
            sentence = vec![prompt_tokens[i + 1]];
            continue;
        }

-        // Greedy decode with repetition penalty
+        decode_step_times.push(step_elapsed);
+
        let mut last_row = logits_data[logits_data.len() - VOCAB_SIZE..].to_vec();
        for &tok in &seen_tokens {
            let logit = &mut last_row[tok as usize];
@@ -152,21 +217,13 @@ fn main() {
    }
    println!();

-    // Benchmarks
-    let decode_durations: Vec<_> = fwd_durations.iter().skip(prompt_len).collect();
-    if decode_durations.len() > 2 {
-        println!(
-            "  TTFT: {:.2} ms",
-            fwd_durations[..prompt_len]
-                .iter()
-                .sum::<Duration>()
-                .as_secs_f64()
-                * 1e3
-        );
+    // ── Report ────────────────────────────────────────────────────────────
+    println!("  TTFT: {:.2} ms", ttft_ms);
+    if decode_step_times.len() > 1 {
        println!(
            "  TPOT: {:.2} ms",
-            (decode_durations.iter().skip(1).copied().sum::<Duration>()
-                / (decode_durations.len() - 1) as u32)
+            (decode_step_times.iter().skip(1).sum::<Duration>()
+                / (decode_step_times.len() - 1) as u32)
                .as_secs_f64()
                * 1_000.
        );
--- a/examples/gemma4_moe/src/main.rs
+++ b/examples/gemma4_moe/src/main.rs
@@ -56,6 +56,7 @@ fn main() {
    }

    println!("Building E-Graph...");
+    let compile_start = std::time::Instant::now();
    cx.build_search_space::<CudaRuntime>();

    println!("Loading weights...");
@@ -75,6 +76,7 @@ fn main() {
    runtime.set_data(input, vec![1]);
    runtime.set_data(pos_ids, vec![1]);
    runtime = cx.search(runtime, search_graphs);
+    println!("  COMPILE: {:.2} ms", compile_start.elapsed().as_secs_f64() * 1e3);

    for layer in 0..LAYERS {
        let cache_bytes = cache_bytes_for_layer(layer, max_seq_len);
@@ -82,6 +84,64 @@ fn main() {
        runtime.set_zeros(kv_cache.v_caches[layer], cache_bytes);
    }

+    // Full-prompt warmup: run the complete prompt to bring the GPU to steady state before timing
+    for (w_pos, &w_token) in prompt_tokens.iter().enumerate() {
+        cx.set_dim('s', 1);
+        cx.set_dim('p', w_pos);
+        runtime.set_data(input, vec![w_token as i32]);
+        runtime.set_data(pos_ids, vec![w_pos as i32]);
+        runtime.execute(&cx.dyn_map);
+        for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
+            let k_buf = runtime.remove_buffer(*k_out);
+            let v_buf = runtime.remove_buffer(*v_out);
+            runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
+            runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
+        }
+    }
+    for layer in 0..LAYERS {
+        let cache_bytes = cache_bytes_for_layer(layer, max_seq_len);
+        runtime.set_zeros(kv_cache.k_caches[layer], cache_bytes);
+        runtime.set_zeros(kv_cache.v_caches[layer], cache_bytes);
+    }
+
+    let iters = env_usize("ITERS", 3);
+    let prompt_len = prompt_tokens.len();
+
+    // ── TTFT: prefill-only timing over N iterations ───────────────────────
+    let mut ttft_samples_ms: Vec<f64> = vec![];
+    for _ in 0..iters {
+        for layer in 0..LAYERS {
+            let cache_bytes = cache_bytes_for_layer(layer, max_seq_len);
+            runtime.set_zeros(kv_cache.k_caches[layer], cache_bytes);
+            runtime.set_zeros(kv_cache.v_caches[layer], cache_bytes);
+        }
+        let prefill_start = std::time::Instant::now();
+        let mut prev_seq = 0usize;
+        for &token in &prompt_tokens {
+            cx.set_dim('s', 1);
+            cx.set_dim('p', prev_seq);
+            runtime.set_data(input, vec![token as i32]);
+            runtime.set_data(pos_ids, vec![prev_seq as i32]);
+            runtime.execute(&cx.dyn_map);
+            for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
+                let k_buf = runtime.remove_buffer(*k_out);
+                let v_buf = runtime.remove_buffer(*v_out);
+                runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
+                runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
+            }
+            prev_seq += 1;
+        }
+        ttft_samples_ms.push(prefill_start.elapsed().as_secs_f64() * 1e3);
+    }
+    ttft_samples_ms.sort_by(|a, b| a.partial_cmp(b).unwrap());
+    let ttft_ms = ttft_samples_ms[ttft_samples_ms.len() / 2];
+
+    // ── Text generation: one pass for TPOT + visible output ───────────────
+    for layer in 0..LAYERS {
+        let cache_bytes = cache_bytes_for_layer(layer, max_seq_len);
+        runtime.set_zeros(kv_cache.k_caches[layer], cache_bytes);
+        runtime.set_zeros(kv_cache.v_caches[layer], cache_bytes);
+    }
    print!("{prompt}");
    std::io::stdout().flush().unwrap();

@@ -93,24 +153,20 @@ fn main() {

    const EOS_TOKEN: u32 = 1;

-    let prefill_start = std::time::Instant::now();
    for &token in &prompt_tokens {
        cx.set_dim('s', 1);
        cx.set_dim('p', prev_seq);
        runtime.set_data(input, vec![token as i32]);
        runtime.set_data(pos_ids, vec![prev_seq as i32]);
        runtime.execute(&cx.dyn_map);
-
        for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
            let k_buf = runtime.remove_buffer(*k_out);
            let v_buf = runtime.remove_buffer(*v_out);
            runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
            runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
        }
-
        prev_seq += 1;
    }
-    let prefill_duration = prefill_start.elapsed();

    let logits_data = runtime.get_f32(logits);
    let last_row = &logits_data[..VOCAB_SIZE];
@@ -139,7 +195,6 @@ fn main() {
            runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
            runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
        }
-
        prev_seq += 1;

        let logits_data = runtime.get_f32(logits);
@@ -174,10 +229,10 @@ fn main() {
        println!("Generated token ids: {generated_token_ids:?}");
    }

+    // ── Report ────────────────────────────────────────────────────────────
    println!(
        "  TTFT: {:.2} ms ({} prompt tokens)",
-        prefill_duration.as_secs_f64() * 1e3,
-        prompt_tokens.len()
+        ttft_ms, prompt_len
    );
    if fwd_durations.len() > 1 {
        println!(
--- a/examples/llama/src/main.rs
+++ b/examples/llama/src/main.rs
@@ -13,11 +13,21 @@ use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};

 const REPO_ID: &str = "NousResearch/Meta-Llama-3-8B-Instruct";

+// Default configuration — override at runtime via env vars.
+const DEFAULT_MAX_SEQ_LEN:   usize = 4096;
+const DEFAULT_SEARCH_GRAPHS: usize = 500;
+const DEFAULT_GEN_TOKENS:    usize = 500;
+
+fn env_usize(name: &str, default: usize) -> usize {
+    std::env::var(name).ok().and_then(|s| s.parse().ok()).unwrap_or(default)
+}
+
 fn main() {
-    let max_seq_len = 4096;
-    let gen_tokens = 500;
-    let search_graphs = 500;
-    let prompt = "Explain what a neural network is in a paragraph.";
+    let max_seq_len    = env_usize("MAX_SEQ_LEN",   DEFAULT_MAX_SEQ_LEN);
+    let gen_tokens     = env_usize("GEN_TOKENS",    DEFAULT_GEN_TOKENS);
+    let search_graphs  = env_usize("SEARCH_GRAPHS", DEFAULT_SEARCH_GRAPHS);
+    let prompt = std::env::var("PROMPT")
+        .unwrap_or_else(|_| "Explain what a neural network is in a paragraph.".to_string());

    tracing_subscriber::registry()
        .with(tracing_subscriber::fmt::layer())
@@ -53,6 +63,7 @@ fn main() {
    }

    println!("Building E-Graph...");
+    let compile_start = std::time::Instant::now();
    cx.build_search_space::<CudaRuntime>();

    println!("Loading weights...");
@@ -72,36 +83,90 @@ fn main() {
    runtime.set_data(input, vec![1]);
    runtime.set_data(token_ids, vec![1]);
    runtime = cx.search(runtime, search_graphs);
+    println!("  COMPILE: {:.2} ms", compile_start.elapsed().as_secs_f64() * 1e3);

    for i in 0..LAYERS {
        runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
        runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
    }

+    // Full-prompt warmup: run the complete prompt to bring the GPU to steady state before timing
+    for (w_step, &w_token) in prompt_tokens.iter().enumerate() {
+        let p = w_step + 1;
+        cx.set_dim('s', 1);
+        cx.set_dim('p', p);
+        runtime.set_data(input, vec![w_token as i32]);
+        runtime.set_data(token_ids, vec![p as i32]);
+        runtime.execute(&cx.dyn_map);
+        for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
+            let k_buf = runtime.remove_buffer(*k_out);
+            let v_buf = runtime.remove_buffer(*v_out);
+            runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
+            runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
+        }
+    }
+    for i in 0..LAYERS {
+        runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
+        runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
+    }
+
+    let iters       = env_usize("ITERS", 3);
+    let prompt_len  = prompt_tokens.len();
+
+    println!("Prompt: {} tokens, generating up to {} tokens", prompt_len, gen_tokens);
+
+    // ── TTFT: prefill-only timing over N iterations ───────────────────────
+    let mut ttft_samples_ms: Vec<f64> = vec![];
+    for _ in 0..iters {
+        for i in 0..LAYERS {
+            runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
+            runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
+        }
+        let mut prev_seq = 1usize;
+        let mut step_times = vec![];
+        for step in 0..prompt_len {
+            cx.set_dim('s', 1);
+            cx.set_dim('p', prev_seq);
+            runtime.set_data(input, vec![prompt_tokens[step] as i32]);
+            runtime.set_data(token_ids, vec![prev_seq as i32]);
+            let t = std::time::Instant::now();
+            runtime.execute(&cx.dyn_map);
+            let _ = runtime.get_f32(logits);
+            step_times.push(t.elapsed());
+            for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
+                let k_buf = runtime.remove_buffer(*k_out);
+                let v_buf = runtime.remove_buffer(*v_out);
+                runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
+                runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
+            }
+            prev_seq += 1;
+        }
+        ttft_samples_ms.push(step_times.iter().sum::<Duration>().as_secs_f64() * 1e3);
+    }
+    ttft_samples_ms.sort_by(|a, b| a.partial_cmp(b).unwrap());
+    let ttft_ms = ttft_samples_ms[ttft_samples_ms.len() / 2];
+
+    // ── Text generation: one pass for TPOT + visible output ───────────────
+    for i in 0..LAYERS {
+        runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
+        runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
+    }
    let mut prev_seq = 1usize;
    let mut sentence = vec![prompt_tokens[0]];
-    let total_steps = prompt_tokens.len() - 1 + gen_tokens;
-    let prompt_len = prompt_tokens.len();
-    let mut fwd_durations = vec![];
+    let total_steps = prompt_len - 1 + gen_tokens;
+    let mut decode_step_times: Vec<Duration> = vec![];
    let mut seen_tokens = FxHashSet::default();
    let repetition_penalty: f32 = 1.05;

    const EOS_TOKEN: u32 = 128009;
    const STOP_TOKEN: u32 = 128001;

-    println!(
-        "Prompt: {} tokens, generating up to {} tokens",
-        prompt_len, gen_tokens
-    );
-
    for i in 0..total_steps {
-        let start = std::time::Instant::now();
        let is_prefill = i < prompt_len - 1;
        let seq_len = sentence.len();

        cx.set_dim('s', seq_len);
        cx.set_dim('p', prev_seq);
-
        runtime.set_data(
            input,
            sentence.iter().map(|t| *t as i32).collect::<Vec<_>>(),
@@ -111,26 +176,26 @@ fn main() {
            (prev_seq as i32..(seq_len + prev_seq) as i32).collect::<Vec<_>>(),
        );

+        let step_start = std::time::Instant::now();
        runtime.execute(&cx.dyn_map);
        let logits_data = runtime.get_f32(logits);
+        let step_elapsed = step_start.elapsed();

-        // Round-trip KV cache
        for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
            let k_buf = runtime.remove_buffer(*k_out);
            let v_buf = runtime.remove_buffer(*v_out);
            runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
            runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
        }
-
        prev_seq += seq_len;
-        fwd_durations.push(start.elapsed());

        if is_prefill {
            sentence = vec![prompt_tokens[i + 1]];
            continue;
        }

-        // Greedy decode with repetition penalty
+        decode_step_times.push(step_elapsed);
+
        let mut last_row = logits_data[logits_data.len() - VOCAB_SIZE..].to_vec();
        for &tok in &seen_tokens {
            let logit = &mut last_row[tok as usize];
@@ -159,21 +224,13 @@ fn main() {
    }
    println!();

-    // Benchmarks
-    let decode_durations: Vec<_> = fwd_durations.iter().skip(prompt_len).collect();
-    if decode_durations.len() > 2 {
-        println!(
-            "  TTFT: {:.2} ms",
-            fwd_durations[..prompt_len]
-                .iter()
-                .sum::<Duration>()
-                .as_secs_f64()
-                * 1e3
-        );
+    // ── Report ────────────────────────────────────────────────────────────
+    println!("  TTFT: {:.2} ms", ttft_ms);
+    if decode_step_times.len() > 1 {
        println!(
            "  TPOT: {:.2} ms",
-            (decode_durations.iter().skip(1).copied().sum::<Duration>()
-                / (decode_durations.len() - 1) as u32)
+            (decode_step_times.iter().skip(1).sum::<Duration>()
+                / (decode_step_times.len() - 1) as u32)
                .as_secs_f64()
                * 1_000.
        );
--- a/examples/qwen/src/main.rs
+++ b/examples/qwen/src/main.rs
@@ -13,11 +13,21 @@ use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};

 const REPO_ID: &str = "Qwen/Qwen3-4B";

+// Default configuration — override at runtime via env vars.
+const DEFAULT_MAX_SEQ_LEN:   usize = 4096;
+const DEFAULT_SEARCH_GRAPHS: usize = 50;
+const DEFAULT_GEN_TOKENS:    usize = 500;
+
+fn env_usize(name: &str, default: usize) -> usize {
+    std::env::var(name).ok().and_then(|s| s.parse().ok()).unwrap_or(default)
+}
+
 fn main() {
-    let max_seq_len = 4096;
-    let gen_tokens = 500;
-    let search_graphs = 500;
-    let prompt = "Explain what a neural network is in a paragraph.";
+    let max_seq_len    = env_usize("MAX_SEQ_LEN",   DEFAULT_MAX_SEQ_LEN);
+    let gen_tokens     = env_usize("GEN_TOKENS",    DEFAULT_GEN_TOKENS);
+    let search_graphs  = env_usize("SEARCH_GRAPHS", DEFAULT_SEARCH_GRAPHS);
+    let prompt = std::env::var("PROMPT")
+        .unwrap_or_else(|_| "Explain what a neural network is in a paragraph.".to_string());

    tracing_subscriber::registry()
        .with(tracing_subscriber::fmt::layer())
@@ -46,6 +56,7 @@ fn main() {
    }

    println!("Building E-Graph...");
+    let compile_start = std::time::Instant::now();
    cx.build_search_space::<CudaRuntime>();

    println!("Loading weights...");
@@ -65,36 +76,90 @@ fn main() {
    runtime.set_data(input, vec![1]);
    runtime.set_data(token_ids, vec![1]);
    runtime = cx.search(runtime, search_graphs);
+    println!("  COMPILE: {:.2} ms", compile_start.elapsed().as_secs_f64() * 1e3);

    for i in 0..LAYERS {
        runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
        runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
    }

+    // Full-prompt warmup: run the complete prompt to bring the GPU to steady state before timing
+    for (w_step, &w_token) in prompt_tokens.iter().enumerate() {
+        let p = w_step + 1;
+        cx.set_dim('s', 1);
+        cx.set_dim('p', p);
+        runtime.set_data(input, vec![w_token as i32]);
+        runtime.set_data(token_ids, vec![p as i32]);
+        runtime.execute(&cx.dyn_map);
+        for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
+            let k_buf = runtime.remove_buffer(*k_out);
+            let v_buf = runtime.remove_buffer(*v_out);
+            runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
+            runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
+        }
+    }
+    for i in 0..LAYERS {
+        runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
+        runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
+    }
+
+    let iters       = env_usize("ITERS", 3);
+    let prompt_len  = prompt_tokens.len();
+
+    println!("Prompt: {} tokens, generating up to {} tokens", prompt_len, gen_tokens);
+
+    // ── TTFT: prefill-only timing over N iterations ───────────────────────
+    let mut ttft_samples_ms: Vec<f64> = vec![];
+    for _ in 0..iters {
+        for i in 0..LAYERS {
+            runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
+            runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
+        }
+        let mut prev_seq = 1usize;
+        let mut step_times = vec![];
+        for step in 0..prompt_len {
+            cx.set_dim('s', 1);
+            cx.set_dim('p', prev_seq);
+            runtime.set_data(input, vec![prompt_tokens[step] as i32]);
+            runtime.set_data(token_ids, vec![prev_seq as i32]);
+            let t = std::time::Instant::now();
+            runtime.execute(&cx.dyn_map);
+            let _ = runtime.get_f32(logits);
+            step_times.push(t.elapsed());
+            for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
+                let k_buf = runtime.remove_buffer(*k_out);
+                let v_buf = runtime.remove_buffer(*v_out);
+                runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
+                runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
+            }
+            prev_seq += 1;
+        }
+        ttft_samples_ms.push(step_times.iter().sum::<Duration>().as_secs_f64() * 1e3);
+    }
+    ttft_samples_ms.sort_by(|a, b| a.partial_cmp(b).unwrap());
+    let ttft_ms = ttft_samples_ms[ttft_samples_ms.len() / 2];
+
+    // ── Text generation: one pass for TPOT + visible output ───────────────
+    for i in 0..LAYERS {
+        runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
+        runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
+    }
    let mut prev_seq = 1usize;
    let mut sentence = vec![prompt_tokens[0]];
-    let total_steps = prompt_tokens.len() - 1 + gen_tokens;
-    let prompt_len = prompt_tokens.len();
-    let mut fwd_durations = vec![];
+    let total_steps = prompt_len - 1 + gen_tokens;
+    let mut decode_step_times: Vec<Duration> = vec![];
    let mut seen_tokens = FxHashSet::default();
    let repetition_penalty: f32 = 1.05;

    const EOS_TOKEN: u32 = 151645; // <|endoftext|>
    const STOP_TOKEN: u32 = 151643; // <|end|>

-    println!(
-        "Prompt: {} tokens, generating up to {} tokens",
-        prompt_len, gen_tokens
-    );
-
    for i in 0..total_steps {
-        let start = std::time::Instant::now();
        let is_prefill = i < prompt_len - 1;
        let seq_len = sentence.len();

        cx.set_dim('s', seq_len);
        cx.set_dim('p', prev_seq);
-
        runtime.set_data(
            input,
            sentence.iter().map(|t| *t as i32).collect::<Vec<_>>(),
@@ -104,26 +169,26 @@ fn main() {
            (prev_seq as i32..(seq_len + prev_seq) as i32).collect::<Vec<_>>(),
        );

+        let step_start = std::time::Instant::now();
        runtime.execute(&cx.dyn_map);
        let logits_data = runtime.get_f32(logits);
+        let step_elapsed = step_start.elapsed();

-        // Round-trip KV cache
        for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
            let k_buf = runtime.remove_buffer(*k_out);
            let v_buf = runtime.remove_buffer(*v_out);
            runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
            runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
        }
-
        prev_seq += seq_len;
-        fwd_durations.push(start.elapsed());

        if is_prefill {
            sentence = vec![prompt_tokens[i + 1]];
            continue;
        }

-        // Greedy decode with repetition penalty
+        decode_step_times.push(step_elapsed);
+
        let mut last_row = logits_data[logits_data.len() - VOCAB_SIZE..].to_vec();
        for &tok in &seen_tokens {
            let logit = &mut last_row[tok as usize];
@@ -152,21 +217,13 @@ fn main() {
    }
    println!();

-    // Benchmarks
-    let decode_durations: Vec<_> = fwd_durations.iter().skip(prompt_len).collect();
-    if decode_durations.len() > 2 {
-        println!(
-            "  TTFT: {:.2} ms",
-            fwd_durations[..prompt_len]
-                .iter()
-                .sum::<Duration>()
-                .as_secs_f64()
-                * 1e3
-        );
+    // ── Report ────────────────────────────────────────────────────────────
+    println!("  TTFT: {:.2} ms", ttft_ms);
+    if decode_step_times.len() > 1 {
        println!(
            "  TPOT: {:.2} ms",
-            (decode_durations.iter().skip(1).copied().sum::<Duration>()
-                / (decode_durations.len() - 1) as u32)
+            (decode_step_times.iter().skip(1).sum::<Duration>()
+                / (decode_step_times.len() - 1) as u32)
                .as_secs_f64()
                * 1_000.
        );
--- a/examples/qwen3_moe/src/main.rs
+++ b/examples/qwen3_moe/src/main.rs
@@ -11,11 +11,21 @@ use tokenizers::Tokenizer;

 const REPO_ID: &str = "Qwen/Qwen3-30B-A3B";

+// Default configuration — override at runtime via env vars.
+const DEFAULT_MAX_SEQ_LEN:   usize = 4096;
+const DEFAULT_SEARCH_GRAPHS: usize = 50;
+const DEFAULT_GEN_TOKENS:    usize = 30;
+
+fn env_usize(name: &str, default: usize) -> usize {
+    std::env::var(name).ok().and_then(|s| s.parse().ok()).unwrap_or(default)
+}
+
 fn main() {
-    let max_seq_len = 4096;
-    let gen_tokens = 30;
-    let search_graphs = 50;
-    let prompt = "The capital of France is";
+    let max_seq_len    = env_usize("MAX_SEQ_LEN",   DEFAULT_MAX_SEQ_LEN);
+    let gen_tokens     = env_usize("GEN_TOKENS",    DEFAULT_GEN_TOKENS);
+    let search_graphs  = env_usize("SEARCH_GRAPHS", DEFAULT_SEARCH_GRAPHS);
+    let prompt = std::env::var("PROMPT")
+        .unwrap_or_else(|_| "The capital of France is".to_string());

    let ctx = CudaContext::new(0).unwrap();
    let stream = ctx.default_stream();
@@ -24,7 +34,7 @@ fn main() {
    println!("Using model directory: {}", model_dir.display());

    let tokenizer = Tokenizer::from_file(model_dir.join("tokenizer.json")).unwrap();
-    let prompt_tokens = tokenizer.encode(prompt, true).unwrap().get_ids().to_vec();
+    let prompt_tokens = tokenizer.encode(prompt.as_str(), true).unwrap().get_ids().to_vec();

    // Build graph
    let mut cx = Graph::default();
@@ -39,6 +49,7 @@ fn main() {
    }

    println!("Building E-Graph...");
+    let compile_start = std::time::Instant::now();
    cx.build_search_space::<CudaRuntime>();

    println!("Loading weights...");
@@ -58,12 +69,68 @@ fn main() {
    runtime.set_data(input, vec![1]);
    runtime.set_data(pos_ids, vec![1]);
    runtime = cx.search(runtime, search_graphs);
+    println!("  COMPILE: {:.2} ms", compile_start.elapsed().as_secs_f64() * 1e3);

    for i in 0..LAYERS {
        runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
        runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
    }

+    // Full-prompt warmup: run the complete prompt to bring the GPU to steady state before timing
+    for (w_pos, &w_token) in prompt_tokens.iter().enumerate() {
+        cx.set_dim('s', 1);
+        cx.set_dim('p', w_pos);
+        runtime.set_data(input, vec![w_token as i32]);
+        runtime.set_data(pos_ids, vec![w_pos as i32]);
+        runtime.execute(&cx.dyn_map);
+        for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
+            let k_buf = runtime.remove_buffer(*k_out);
+            let v_buf = runtime.remove_buffer(*v_out);
+            runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
+            runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
+        }
+    }
+    for i in 0..LAYERS {
+        runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
+        runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
+    }
+
+    let iters      = env_usize("ITERS", 3);
+    let prompt_len = prompt_tokens.len();
+
+    // ── TTFT: prefill-only timing over N iterations ───────────────────────
+    let mut ttft_samples_ms: Vec<f64> = vec![];
+    for _ in 0..iters {
+        for i in 0..LAYERS {
+            runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
+            runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
+        }
+        let prefill_start = std::time::Instant::now();
+        let mut prev_seq = 0usize;
+        for &token in &prompt_tokens {
+            cx.set_dim('s', 1);
+            cx.set_dim('p', prev_seq);
+            runtime.set_data(input, vec![token as i32]);
+            runtime.set_data(pos_ids, vec![prev_seq as i32]);
+            runtime.execute(&cx.dyn_map);
+            for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
+                let k_buf = runtime.remove_buffer(*k_out);
+                let v_buf = runtime.remove_buffer(*v_out);
+                runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
+                runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
+            }
+            prev_seq += 1;
+        }
+        ttft_samples_ms.push(prefill_start.elapsed().as_secs_f64() * 1e3);
+    }
+    ttft_samples_ms.sort_by(|a, b| a.partial_cmp(b).unwrap());
+    let ttft_ms = ttft_samples_ms[ttft_samples_ms.len() / 2];
+
+    // ── Text generation: one pass for TPOT + visible output ───────────────
+    for i in 0..LAYERS {
+        runtime.set_zeros(kv_cache.k_caches[i], cache_bytes);
+        runtime.set_zeros(kv_cache.v_caches[i], cache_bytes);
+    }
    print!("{prompt}");
    std::io::stdout().flush().unwrap();

@@ -75,28 +142,21 @@ fn main() {
    const EOS_TOKEN: u32 = 151645;
    const STOP_TOKEN: u32 = 151643;

-    // Prefill: process prompt tokens one at a time
-    let prefill_start = std::time::Instant::now();
    for &token in &prompt_tokens {
        cx.set_dim('s', 1);
        cx.set_dim('p', prev_seq);
        runtime.set_data(input, vec![token as i32]);
        runtime.set_data(pos_ids, vec![prev_seq as i32]);
        runtime.execute(&cx.dyn_map);
-
-        // Round-trip KV cache
        for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
            let k_buf = runtime.remove_buffer(*k_out);
            let v_buf = runtime.remove_buffer(*v_out);
            runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
            runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
        }
-
        prev_seq += 1;
    }
-    let prefill_duration = prefill_start.elapsed();

-    // Get logits from last prefill step and sample first new token
    let logits_data = runtime.get_f32(logits);
    let last_row = &logits_data[..VOCAB_SIZE];
    let mut next_token = last_row
@@ -109,7 +169,6 @@ fn main() {
    std::io::stdout().flush().unwrap();
    seen_tokens.insert(next_token);

-    // Decode loop
    for _ in 1..gen_tokens {
        let start = std::time::Instant::now();
        cx.set_dim('s', 1);
@@ -117,15 +176,12 @@ fn main() {
        runtime.set_data(input, vec![next_token as i32]);
        runtime.set_data(pos_ids, vec![prev_seq as i32]);
        runtime.execute(&cx.dyn_map);
-
-        // Round-trip KV cache
        for (layer_idx, (k_out, v_out)) in cache_outputs.iter().enumerate() {
            let k_buf = runtime.remove_buffer(*k_out);
            let v_buf = runtime.remove_buffer(*v_out);
            runtime.set_buffer(kv_cache.k_caches[layer_idx], k_buf);
            runtime.set_buffer(kv_cache.v_caches[layer_idx], v_buf);
        }
-
        prev_seq += 1;

        let logits_data = runtime.get_f32(logits);
@@ -156,11 +212,10 @@ fn main() {
    }
    println!();

-    // Report benchmarks
+    // ── Report ────────────────────────────────────────────────────────────
    println!(
        "  TTFT: {:.2} ms ({} prompt tokens)",
-        prefill_duration.as_secs_f64() * 1e3,
-        prompt_tokens.len()
+        ttft_ms, prompt_len
    );
    if fwd_durations.len() > 1 {
        println!(
--- a/src/hlir.rs
+++ b/src/hlir.rs
@@ -2946,13 +2946,19 @@ impl Runtime for NativeRuntime {
            self.buffers.insert(node, output);
        }

-        // Consume all non-Output buffers (inputs + intermediates)
-        let output_nodes: FxHashSet<NodeIndex> = self
+        // Free intermediate computation buffers; keep Input (weights/user data) and Output nodes.
+        // Keeping Input buffers allows the graph to be called multiple times without re-loading
+        // weights. User inputs are re-set before each call via set_data, so stale values are
+        // overwritten. Weight inputs are set once and must survive across calls.
+        let keep_nodes: FxHashSet<NodeIndex> = self
            .graph
            .node_indices()
-            .filter(|n| (**self.graph[*n]).as_any().is::<Output>())
+            .filter(|n| {
+                (**self.graph[*n]).as_any().is::<Output>()
+                    || (**self.graph[*n]).as_any().is::<Input>()
+            })
            .collect();
-        self.buffers.retain(|k, _| output_nodes.contains(k));
+        self.buffers.retain(|k, _| keep_nodes.contains(k));
    }
 }

--- a/src/shape/mod.rs
+++ b/src/shape/mod.rs
@@ -7,7 +7,17 @@ pub use tracker::*;
 use std::ops::{Bound, Range, RangeBounds, RangeFrom, RangeFull, RangeTo, RangeToInclusive};

 pub fn flatten_strides(range: &[Expression], strides: &[Expression]) -> Expression {
-    assert_eq!(range.len(), strides.len());
+    assert_eq!(
+        range.len(),
+        strides.len(),
+        "flatten_strides: shape and strides must have matching dimensionality \
+         (got shape len {}, strides len {}). This typically means an HLIR op \
+         was constructed or extracted with mismatched fields — common culprit \
+         is a Scatter / Gather kernel whose index_strides or src_strides list \
+         wasn't populated alongside index_shape.",
+        range.len(),
+        strides.len(),
+    );
    let mut current_elem_size = Expression::from(1);
    let mut flat_stride = Expression::from(0);
    for (dim, (range, stride)) in range.iter().zip(strides).enumerate().rev() {