luminal_python: suppress torch.export _guards_fn instead of disabling auto-dynamic shapes

Replaces the WIP `automatic_dynamic_shapes = False` workaround (commit 3a3cd049) with a targeted monkey-patch of `torch.export._unlift. _ok_to_generate_guards_fn`. That function already supports a call-stack opt-out (used by executorch / modai / on_device_ai / torchao); we extend it with a "luminal" check so torch.export skips inserting the `_guards_fn` submodule whenever luminal is the embedder. Why the previous workaround was costly: with `automatic_dynamic_shapes = False`, the bench loop's `compiled(input_ids, cache_position=tensor([k]))` recompiles once per `cache_position` *value*, i.e. one full luminal compile per generated token. gemma3-4b smoke = ~2 hr CPU + 200 GB host RSS. The L NameError it was working around fires during aot_autograd's fx.Interpreter trace of a re-exported GraphModule that contains the L-referencing `_guards_fn` body — a dead-end for any non-dynamo consumer of the exported graph. Skipping `_guards_fn` generation at the source restores the compile-once-run-many behaviour of dynamic-shape promotion: dynamo promotes the varying dim to a SymInt on the second compile and reuses the same compiled graph for all subsequent values. The monkey-patch is scoped to luminal's call stack — other consumers of `torch.export` in the same Python process see unmodified behaviour. Verified via a multi-shape compile smoke (`compiled(rand(4,8))` then `compiled(rand(5,8))`): no L NameError. The remaining downstream `SymInt` input passthrough is handled by `_specialize_sym_scalar` in pt2.py and is unrelated to this fix. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
luminal_python: WIP workaround for dynamo "L not defined" on gemma3
2026-06-01 21:49:47 +09:00 · 2026-05-01 23:23:22 +00:00 · 2026-05-01 22:15:17 +00:00 · 2026-05-01 22:14:55 +00:00 · 2026-05-01 18:41:25 +00:00 · 2026-05-01 18:37:26 +00:00
193 changed files with 10254 additions and 37747 deletions
--- a/.github/workflows/modal-examples.yml
+++ b/.github/workflows/modal-examples.yml
@@ -18,11 +18,11 @@ jobs:
        name: "${{ matrix.example }} (Modal ${{ matrix.gpu.type }})"
        runs-on: ubuntu-latest
        environment: Modal
-        timeout-minutes: 120
+        timeout-minutes: 70
        strategy:
            fail-fast: false
            matrix:
-                example: [llama, gemma, qwen, qwen3_moe, gemma4_moe, whisper]
+                example: [llama, gemma, qwen, qwen3_moe]
                gpu:
                    - { type: "A100-80GB" }
                    # To add more GPUs, just append another entry:
--- a/.github/workflows/test-core.yml
+++ b/.github/workflows/test-core.yml
@@ -21,4 +21,4 @@ jobs:
        steps:
            - uses: actions/checkout@v6
            - name: Run tests
-              run: cargo test --release --workspace --exclude luminal_cuda_lite --exclude luminal_metal --exclude luminal_bench --verbose
+              run: cargo test --workspace --exclude luminal_cuda_lite --exclude luminal_metal --exclude luminal_bench --verbose
--- a/.github/workflows/test-cuda.yml
+++ b/.github/workflows/test-cuda.yml
@@ -18,7 +18,7 @@ jobs:
        name: Cuda Unit Tests
        runs-on: ubuntu-latest
        environment: Modal
-        timeout-minutes: 120
+        timeout-minutes: 30

        steps:
            - uses: actions/checkout@v6
--- a/.github/workflows/test-metal.yml
+++ b/.github/workflows/test-metal.yml
@@ -16,4 +16,4 @@ jobs:
        steps:
            - uses: actions/checkout@v6
            - name: Run Metal crate tests
-              run: rustup update; cargo test --release -p luminal_metal --verbose -- --test-threads=1
+              run: rustup update; cargo test -p luminal_metal --verbose -- --test-threads=1
--- a/.github/workflows/test-python-cuda.yml
+++ b/.github/workflows/test-python-cuda.yml
@@ -18,7 +18,7 @@ jobs:
        name: Python CUDA Tests
        runs-on: ubuntu-latest
        environment: Modal
-        timeout-minutes: 120
+        timeout-minutes: 60
        defaults:
            run:
                working-directory: crates/luminal_python
@@ -38,7 +38,7 @@ jobs:
                  MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
                  MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
                  HF_TOKEN: ${{ secrets.HF_TOKEN }}
-              run: modal run modal_pytest_runner.py --gpu A100 --timeout 7200 --profile --profile-output-dir luminal_artifacts/pytest-profiling/github-${{ github.run_id }}-${{ github.run_attempt }} tests/ -v -s -m "not slow"
+              run: modal run modal_pytest_runner.py --gpu A100 --timeout 3300 --profile --profile-output-dir luminal_artifacts/pytest-profiling/github-${{ github.run_id }}-${{ github.run_attempt }} tests/ -v -s -m "not slow"
            - name: Upload Modal pytest profiling artifacts
              if: always()
              uses: actions/upload-artifact@v4
--- a/.github/workflows/test-python-native.yml
+++ b/.github/workflows/test-python-native.yml
@@ -23,6 +23,6 @@ jobs:
            - name: Update Rust toolchain
              run: rustup update
            - name: Build maturin extension
-              run: uv run maturin develop --manifest-path rust/Cargo.toml --profile release
+              run: uv run maturin develop --manifest-path rust/Cargo.toml
            - name: Run pytest
              run: uv run pytest tests/test_hlir_ops.py tests/test_unary.py -v -m "not slow"
--- a/.gitignore
+++ b/.gitignore
@@ -37,3 +37,9 @@ __pycache__/
 dist/
 build/
 uv.lock
+
+# TTFT benchmark SQLite database (per-machine state)
+benchmarks/ttft/bench.db
+benchmarks/ttft/bench.db-journal
+benchmarks/ttft/bench.db-wal
+benchmarks/ttft/bench.db-shm
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -8,14 +8,4 @@ All other functionality is split into crates in the `crates/` directory. For ins
 ## Testing Instructions
 - Find the CI plan in the .github/workflows folder.
 - Currently running `cargo test` in luminal_metal and luminal_cuda_lite require access to an Apple and Nvidia GPU respectively.
- PRs must have no clippy errors and `cargo fmt` must be ran before a PR is submitted.
-
-## Debugging and Correctness
- Treat model examples as specifications of the intended architecture. Do not change model code, prompt templates, weights, or example logic to hide compiler/runtime/search bugs unless the model code is demonstrably semantically wrong.
- When outputs are incorrect, first root-cause the failing compiler/runtime path. Prefer isolating the bad LLIR/HLIR graph, rewrite, op lowering, shape/stride assumption, layout contract, or runtime implementation that caused the mismatch.
- Avoid narrow special-case fixes. A fix should state and enforce the general invariant it relies on, or explicitly document why the affected operation is only valid for a restricted layout/shape and ensure rewrites enforce that restriction.
- For e-graph/search issues, assume all selectable LLIR graphs are intended to be semantically equivalent. If two selectable graphs disagree, debug the equivalence violation rather than selecting around the bad graph.
- Add regression tests at the level where the bug occurred. Prefer tests that compare against a semantic reference such as `NativeRuntime` or a small independent reference, and use fixed seeds for any randomized search/fuzz test so failures are reproducible.
-
-## Compiler Rewrite Boundary
- All graph pattern matching and op selection must be expressed in egglog rewrites. Do not add Rust-side LLIR graph post-passes that search for op patterns, fuse kernels, select backend ops, or otherwise rewrite extracted graphs after egglog. If a backend needs a fused/specialized op, add the match and rewrite in egglog and let extraction produce that op directly.
+- PRs must have no clippy errors and `cargo fmt` must be ran before a PR is submitted.
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,7 +25,6 @@ generational-box = "0.5.6"
 serde_json = "1.0.140"
 egglog = {git="https://github.com/egraphs-good/egglog", rev="0a8cc35a6c68d0460c20449d5fa19ca3caba2923"}
 egglog-ast = {git="https://github.com/egraphs-good/egglog", rev="0a8cc35a6c68d0460c20449d5fa19ca3caba2923"}
-egglog-reports = {git="https://github.com/egraphs-good/egglog", rev="0a8cc35a6c68d0460c20449d5fa19ca3caba2923"}
 egraph-serialize = { version = "0.3.0", default-features = false, features = ["graphviz", "serde"]}
 tracing = "0.1.43"
 paste = "1.0.15"
--- a/README.md
+++ b/README.md
@@ -55,27 +55,23 @@ Luminal can run Q8 Llama 3 8B at ~80% of theoretical max performance on an H100.

 The core of Luminal is and always will be minimal. It should be possible to understand the entire core library in an afternoon.

-### PyTorch-native
-
-Luminal directly integrates with PyTorch as a compiler backend. Simply do `torch.compile(model, backend=luminal_cuda)` to compile your PyTorch models. We also have an excellent tensor API in Rust.
-
 ### RISC-style architecture

-Everything in Luminal boils down to 15 primitive ops:
+Everything in Luminal boils down to 14 primitive ops:

 - Unary - `Log2, Exp2, Sin, Sqrt, Recip`
 - Binary - `Add, Mul, Mod, LessThan`
- Other - `SumReduce, MaxReduce, Iota, Gather, Scatter, Cast`
+- Other - `SumReduce, MaxReduce, Iota, Gather, Cast`

-These ops are enough to support transformers, convnets, and nearly every popular model in the world.
+These ops are enough to support transformers, convnets, and nearly every popular model.

 ### Search

-The best heuristic is no heuristic. Luminal tries to search every possible decision to give the compiler the flexibility to discover complex optimizations. This allows us to automatically discover Flash Attention and other similarly complex optimizations without relying on hand-written operations or heuristics. It also allows us to stay extremely small and simple long into the future and beat the performance of far larger frameworks.
+The best heuristic is no heuristic. We try to search every possible decision to give the compiler the most flexibility to discover complex optimizations. This allows us to automatically derive Flash Attention and other similarly complex rewrites. It also allows us to stay extremely small long into the future and beat the performance of far larger frameworks with tons of handwritten kernels.

 ### Native

-The current ML ecosystem is too fragmented, and the solution isn't another layer of abstraction. Luminal is written in rust, and interacts directly with the accelerator APIs (CUDA, Metal, etc.). No indirections or abstractions, compatability layers, docker containers, or virtual environments. Just a statically-linked rust crate.
+The current ML ecosystem is too fragmented, and the solution isn't another layer of abstraction. Luminal is written in rust, and interacts directly with the CUDA / Metal APIs. No indirections or abstractions, docker containers, or virtual environments. Just a statically-linked rust crate.

 ### Validated against Pytorch

@@ -89,45 +85,39 @@ Most deep learning libraries are eager-first, meaning each op call directly oper

 However, this isn't great for performance. What makes sense for a developer doesn't work well for the machine, in the same way that no one writes assembly by hand. Most libraries try to fix this problem by tacking on operator fusion or JIT compilation to try to change the compilation flow to something better for the machine. Turns out this is [super](https://docs.pytorch.org/docs/stable/torch.compiler_dynamo_overview.html) [difficult](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) [even](https://pytorch.org/docs/stable/jit.html) [for](https://pytorch.org/docs/stable/fx.html#torch.fx.symbolic_trace) Pytorch!

-### What about XLA?
-
-XLA, torch.compile, TVM, and other traditional compiler stacks suffer from complexity explosion. They are made up of a very large set of destructive (one-direction) rewrite rules that lower and optimize a graph from a high-level representation to low-level machine code. But since these rules are destructive, they are required to only fire when it's certian that there's a performance benefit. This leads to the rules becoming very complex, special-cased, and numerous. Once additional hardware backends, model architectures, and new dtypes get thrown in, they suffer from the weight of their complexity and often produce very suboptimal code, requiring DSLs like Pallas or Triton to regain performance.
-
 ### Compile everything

-A core tenet of Luminal is ahead-of-time compilation. Whenever possible, push everything to compile time and leave nothing to run time. Luminal takes an approach more similar to [XLA](https://www.tensorflow.org/xla), and [tinygrad](https://github.com/tinygrad/tinygrad). Everything's static here. When you write out an expression like `x + y`, no actual computation happens. The operation is recorded to a directed acyclic computation graph for execution later. Only once `graph.execute()` is ran does the computation happen. _But isn't that just lazy execution?_ Yes it is! But in luminal **everything is done this way**. All neural networks are built up as a static computation graphs, compiled, and executed later.
-
-### First-class dynamism
-
-A fully-static world would be nice, but we live in a world of nessecary dynamism. So we model dynamic shapes natively, as symbolic dimensions. Luminal supports arbitrary symbolic dimensions, including complex expressions, to give us shapes like `(s, 4096)`, `(b, h, w + 3)`, etc. This rich representation gives the compiler full visibility into shapes and lets it still do aggressive specialization.
+A core tenet of Luminal is ahead-of-time compilation. Whenever possible, push everything to compile time and leave nothing to run time. Luminal takes an approach more similar to [XLA](https://www.tensorflow.org/xla), and [tinygrad](https://github.com/tinygrad/tinygrad). Everything's static here. When you write out an expression like `x + y`, no actual computation happens. The operation is recorded to a directed acyclic computation graph for execution later. Only once `graph.execute()` is ran does the computation happen. _But isn't that just lazy execution?_ Yes it is! But in luminal **everything is done this way**. All neural networks are built up as one or a few static computation graphs, compiled, and executed later.

 **But why?**

-A consequence of this is that the actual computation that gets ran can be radically different than the code that was written. Since we have an entire neural network fully represented in a compute graph, Luminal has global knowledge. This means we can push most ML complexity to the compiler. For instance, devices, datatypes, and even autograd is modeled ahead of time and optimized by the compiler!
+A consequence of this is that the actual computation that gets ran can be radically different than the code that was written. Since we have an entire neural network fully represented in a compute graph, our compilers have global knowledge. This means we can push most ML complexity to the compilers. For instance, devices, datatypes, and execution schedules are all handled by compliers. Even autograd is handled by a compiler!

 Now we can do:

 - Aggressive kernel fusion
 - Shape-specific kernels compiled at runtime
- Low-precision dtypes (mxfp4, nvfp4, fp8, etc.)
- Complex mutli-device parallelism topologies, searched ahead-of-time
- Networks can be written in generic code, but compiled and ran fast on hyper-specific architectures
+- Devices and Dtypes are handled through compilers (just run the CUDA compiler to convert the graph to use CUDA kernels, then the fp16 compiler to convert to half-precision kernels)
+- Networks can be written in generic code, but compiled and ran fast on hyper-specific architectures (try writing a PyTorch network that works with both TF32 dtypes and TPUs; get ready for if statement hell...)

 ## Where are we?

- Native PyTorch support
- Many kernel libraries supported in the search space (FlashInfer, cuBLASLt, etc.)
- Many models implemented in our Rust tensor API in `examples/`.
+- Search is partially merged. We are between 1.0 and 2.0 (search), which will be completed within the next month or so.
+- Metal and Cuda are supported for running models on Macs and Nvidia GPUs respectively, in both full and half precision.
+- Full training support with graph-based autograd.
+- Llama 3, Phi 3, Whisper and Yolo v8 are implemented in `examples/`. See instructions above for running.
 - We have a small library of NN modules in `luminal_nn`, including transformers.
 - A significant amount of high-level ops are implemented in `hl_ops`. We are aiming to match the most used ~80% of the pytorch api.

 Some things on the roadmap:

- More fine-grained dialects supporting thread- and warp-level intrinsics like TMA and tcgen.05
- ROCm backend
- More public infernce accelerator backends (coming very soon...)
- Public benchmarking suite
- Automatically searched model parallelism (TP, PP, EPS, EPR, SP, etc.)
+- Expand the search space to utilize Tensor Cores more flexibly
+- Bring cuda to parity with Metal
+- Add Blackwell intrinsics, such as TMEM and TMA
+- Build a ROCm backend
+- Build benchmarking suite to test against other libs
+- Distributed data, pipeline and tensor parallel.
+- Beat PT 2.0 perf on LLM inference _and_ training
 - Write compiler for quantum photonic retro encabulator
 - Build dyson swarm

--- a/benchmarks/ttft/bench_python_baseline.py
+++ b/benchmarks/ttft/bench_python_baseline.py
@@ -0,0 +1,117 @@
+"""Pure HuggingFace/PyTorch TTFT + TPOT bench. Prints a JSON line on stdout.
+
+Measures:
+  TTFT — sum of single-token forward-pass durations over the prompt, using
+         a StaticCache. Methodology matches bench_python_luminal.py and the
+         rust path so the cross-path comparison is apples-to-apples.
+  TPOT — average time per output token during KV-cache greedy decode.
+"""
+
+import argparse
+import json
+import statistics
+import time
+
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers.cache_utils import StaticCache
+
+from bench_utils import encode_prompt, measure_tpot, static_cache_config
+
+DEFAULT_MODEL = "NousResearch/Meta-Llama-3-8B-Instruct"
+DEFAULT_PROMPT = "Explain what a neural network is in a paragraph."
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", default=DEFAULT_MODEL)
+    ap.add_argument("--prompt", default=DEFAULT_PROMPT)
+    ap.add_argument("--warmups", type=int, default=1)
+    ap.add_argument("--iters", type=int, default=3)
+    ap.add_argument("--dtype", default="float32", choices=["float32", "bfloat16", "float16"])
+    ap.add_argument("--decode-tokens", type=int, default=50,
+                    help="Number of tokens to generate for TPOT measurement (0 = skip).")
+    ap.add_argument("--max-cache-len", type=int, default=256,
+                    help="StaticCache max sequence length.")
+    args = ap.parse_args()
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    dtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[args.dtype]
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    input_ids = encode_prompt(tokenizer, args.prompt, device)
+    prompt_tokens = int(input_ids.shape[-1])
+
+    config = AutoConfig.from_pretrained(args.model)
+    config._attn_implementation = "eager"
+
+    model = (
+        AutoModelForCausalLM.from_pretrained(args.model, config=config, torch_dtype=dtype)
+        .eval()
+        .to(device)
+    )
+
+    single_token = torch.zeros(1, 1, dtype=torch.long, device=device)
+
+    cache_config = static_cache_config(config)
+
+    def make_cache():
+        return StaticCache(
+            config=cache_config,
+            max_batch_size=1,
+            max_cache_len=args.max_cache_len,
+            device=device,
+            dtype=dtype,
+        )
+
+    def measure_ttft() -> float:
+        """Sum of per-token forward-pass durations over prompt_tokens steps."""
+        kv = make_cache()
+        # Eager init at position 0 to satisfy StaticCache.lazy_initialization.
+        with torch.no_grad():
+            model(single_token, past_key_values=kv,
+                  cache_position=torch.tensor([0], device=device))
+        total_ms = 0.0
+        for pos in range(1, prompt_tokens):
+            if device.type == "cuda":
+                torch.cuda.synchronize()
+            t0 = time.perf_counter()
+            with torch.no_grad():
+                model(single_token, past_key_values=kv,
+                      cache_position=torch.tensor([pos], device=device))
+                if device.type == "cuda":
+                    torch.cuda.synchronize()
+            total_ms += (time.perf_counter() - t0) * 1000.0
+        return total_ms
+
+    for _ in range(args.warmups):
+        measure_ttft()
+
+    ttft_samples_ms = [measure_ttft() for _ in range(args.iters)]
+
+    result = {
+        "path": "python_baseline",
+        "model": args.model,
+        "device": str(device),
+        "dtype": args.dtype,
+        "prompt_tokens": prompt_tokens,
+        "iters": args.iters,
+        "ttft_ms": statistics.median(ttft_samples_ms),
+        "ttft_ms_mean": sum(ttft_samples_ms) / len(ttft_samples_ms),
+        "ttft_ms_samples": ttft_samples_ms,
+        "note": "sequential per-token, StaticCache KV cache",
+    }
+
+    if args.decode_tokens > 0:
+        tpot_samples_ms = measure_tpot(model, input_ids, device, args.decode_tokens)
+        tpot_ms = sum(tpot_samples_ms) / len(tpot_samples_ms)
+        result["decode_tokens"] = args.decode_tokens
+        result["tpot_ms"] = tpot_ms
+        result["tpot_ms_samples"] = tpot_samples_ms
+        result["throughput_tps"] = 1000.0 / tpot_ms
+
+    print("BENCH_RESULT " + json.dumps(result))
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/ttft/bench_python_luminal.py
+++ b/benchmarks/ttft/bench_python_luminal.py
@@ -0,0 +1,196 @@
+"""Python -> Luminal TTFT + TPOT bench via torch.compile(backend=luminal_backend).
+
+Methodology mirrors examples/llama (the Rust path):
+  - One eager prefill step initialises the StaticCache (required by transformers'
+    StaticCache.lazy_initialization) before compilation.
+  - TTFT: run one forward pass per prompt token sequentially, each advancing
+    cache_position by 1; sum durations.
+  - TPOT: run --decode-tokens more single-token passes; average durations.
+  - StaticCache pre-allocates K/V buffers up to max_cache_len; no growing allocation.
+
+Prints a BENCH_RESULT JSON line on stdout.
+"""
+
+import argparse
+import gc
+import json
+import statistics
+import time
+
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers.cache_utils import StaticCache
+
+from bench_utils import encode_prompt, static_cache_config
+from luminal import luminal_backend
+
+DEFAULT_MODEL = "NousResearch/Meta-Llama-3-8B-Instruct"
+DEFAULT_PROMPT = "Explain what a neural network is in a paragraph."
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", default=DEFAULT_MODEL)
+    ap.add_argument("--prompt", default=DEFAULT_PROMPT)
+    ap.add_argument("--warmups", type=int, default=1)
+    ap.add_argument("--iters", type=int, default=3)
+    ap.add_argument(
+        "--search-iters",
+        type=int,
+        default=500,
+        help="Egraph search iterations (matches examples/llama default of 500).",
+    )
+    ap.add_argument(
+        "--decode-tokens",
+        type=int,
+        default=50,
+        help="Tokens to generate for TPOT measurement (0 = skip TPOT).",
+    )
+    ap.add_argument(
+        "--max-cache-len",
+        type=int,
+        default=256,
+        help="StaticCache max sequence length.",
+    )
+    ap.add_argument(
+        "--dtype",
+        default="float32",
+        choices=["float32", "bfloat16", "float16"],
+        help="Torch dtype for model + StaticCache.",
+    )
+    args = ap.parse_args()
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    dtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[args.dtype]
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    input_ids = encode_prompt(tokenizer, args.prompt, device)
+    prompt_tokens = int(input_ids.shape[-1])
+
+    config = AutoConfig.from_pretrained(args.model)
+    config._attn_implementation = "eager"
+
+    model = (
+        AutoModelForCausalLM.from_pretrained(args.model, config=config, torch_dtype=dtype)
+        .eval()
+        .to(device)
+    )
+
+    single_token = torch.zeros(1, 1, dtype=torch.long, device=device)
+
+    cache_config = static_cache_config(config)
+
+    def make_cache():
+        return StaticCache(
+            config=cache_config,
+            max_batch_size=1,
+            max_cache_len=args.max_cache_len,
+            device=device,
+            dtype=dtype,
+        )
+
+    # Step 0: run ONE eager prefill to initialise the cache tensors and call
+    # mark_static_address (required by transformers' StaticCache before compile).
+    cache = make_cache()
+    with torch.no_grad():
+        model(single_token, past_key_values=cache, cache_position=torch.tensor([0], device=device))
+
+    # Compile for a single-token input — same graph is reused for every step.
+    # Compilation happens on the first call after the eager init above.
+    t0 = time.perf_counter()
+    compiled = torch.compile(
+        model,
+        backend=luminal_backend,
+        options={"search_iterations": args.search_iters},
+    )
+    cache_position = torch.tensor([1], dtype=torch.long, device=device)
+    with torch.no_grad():
+        compiled(single_token, past_key_values=cache, cache_position=cache_position)
+        if device.type == "cuda":
+            torch.cuda.synchronize()
+    compile_ms = (time.perf_counter() - t0) * 1000.0
+
+    gc.collect()
+    if device.type == "cuda":
+        torch.cuda.empty_cache()
+
+    def one_step(pos: int, kv_cache):
+        cache_pos = torch.tensor([pos], dtype=torch.long, device=device)
+        with torch.no_grad():
+            compiled(single_token, past_key_values=kv_cache, cache_position=cache_pos)
+            if device.type == "cuda":
+                torch.cuda.synchronize()
+
+    def measure_ttft():
+        """Sum of per-token forward-pass durations over prompt_tokens steps.
+
+        Uses a fresh cache so each TTFT measurement is independent.
+        """
+        kv = make_cache()
+        # Eager init for this fresh cache (required before compiled can run on it).
+        with torch.no_grad():
+            model(single_token, past_key_values=kv, cache_position=torch.tensor([0], device=device))
+        total_ms = 0.0
+        # Step 0 was the eager init above; measure from step 1 to prompt_tokens.
+        for pos in range(1, prompt_tokens):
+            if device.type == "cuda":
+                torch.cuda.synchronize()
+            t0 = time.perf_counter()
+            one_step(pos, kv)
+            total_ms += (time.perf_counter() - t0) * 1000.0
+        return total_ms
+
+    def measure_tpot(n, start_pos: int):
+        """Average single-token forward-pass duration over n decode steps."""
+        kv = make_cache()
+        # Eager init
+        with torch.no_grad():
+            model(single_token, past_key_values=kv, cache_position=torch.tensor([0], device=device))
+        # One warmup step.
+        one_step(1, kv)
+        step_times_ms = []
+        for i in range(n):
+            pos = start_pos + i
+            if device.type == "cuda":
+                torch.cuda.synchronize()
+            t0 = time.perf_counter()
+            one_step(pos, kv)
+            step_times_ms.append((time.perf_counter() - t0) * 1000.0)
+        return step_times_ms
+
+    # Warmups before timing TTFT (all run after compilation is complete).
+    for _ in range(args.warmups):
+        measure_ttft()
+
+    ttft_samples_ms = [measure_ttft() for _ in range(args.iters)]
+
+    tpot_ms_samples = []
+    if args.decode_tokens > 0:
+        tpot_ms_samples = measure_tpot(args.decode_tokens, start_pos=prompt_tokens)
+
+    tpot_ms = sum(tpot_ms_samples) / len(tpot_ms_samples) if tpot_ms_samples else None
+    throughput_tps = (1000.0 / tpot_ms) if tpot_ms else None
+
+    result = {
+        "path": "python_luminal",
+        "model": args.model,
+        "device": str(device),
+        "dtype": args.dtype,
+        "prompt_tokens": prompt_tokens,
+        "iters": args.iters,
+        "ttft_ms": statistics.median(ttft_samples_ms),
+        "ttft_ms_mean": sum(ttft_samples_ms) / len(ttft_samples_ms),
+        "ttft_ms_samples": ttft_samples_ms,
+        "compile_ms": compile_ms,
+        "search_iters": args.search_iters,
+        "decode_tokens": args.decode_tokens if args.decode_tokens > 0 else None,
+        "tpot_ms": tpot_ms,
+        "tpot_ms_samples": tpot_ms_samples,
+        "throughput_tps": throughput_tps,
+        "note": "sequential per-token, StaticCache KV cache",
+    }
+    print("BENCH_RESULT " + json.dumps(result))
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/ttft/bench_python_torch_compile.py
+++ b/benchmarks/ttft/bench_python_torch_compile.py
@@ -0,0 +1,138 @@
+"""Vanilla torch.compile TTFT + TPOT bench. Prints a JSON line on stdout.
+
+Uses the default inductor backend (torch.compile without a custom backend).
+TTFT uses sequential per-token prefill with a StaticCache so the methodology
+matches bench_python_baseline.py, bench_python_luminal.py, and the rust path.
+"""
+
+import argparse
+import json
+import statistics
+import time
+
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers.cache_utils import StaticCache
+
+from bench_utils import encode_prompt, measure_tpot, static_cache_config
+
+DEFAULT_MODEL = "NousResearch/Meta-Llama-3-8B-Instruct"
+DEFAULT_PROMPT = "Explain what a neural network is in a paragraph."
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", default=DEFAULT_MODEL)
+    ap.add_argument("--prompt", default=DEFAULT_PROMPT)
+    ap.add_argument("--warmups", type=int, default=1)
+    ap.add_argument("--iters", type=int, default=3)
+    ap.add_argument("--dtype", default="float32", choices=["float32", "bfloat16", "float16"])
+    ap.add_argument(
+        "--decode-tokens", type=int, default=50,
+        help="Number of tokens to generate for TPOT measurement (0 = skip).",
+    )
+    ap.add_argument("--max-cache-len", type=int, default=256,
+                    help="StaticCache max sequence length.")
+    args = ap.parse_args()
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    dtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[args.dtype]
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    input_ids = encode_prompt(tokenizer, args.prompt, device)
+    prompt_tokens = int(input_ids.shape[-1])
+
+    config = AutoConfig.from_pretrained(args.model)
+    config._attn_implementation = "eager"
+
+    model = (
+        AutoModelForCausalLM.from_pretrained(args.model, config=config, torch_dtype=dtype)
+        .eval()
+        .to(device)
+    )
+
+    single_token = torch.zeros(1, 1, dtype=torch.long, device=device)
+
+    cache_config = static_cache_config(config)
+
+    def make_cache():
+        return StaticCache(
+            config=cache_config,
+            max_batch_size=1,
+            max_cache_len=args.max_cache_len,
+            device=device,
+            dtype=dtype,
+        )
+
+    # Eager init on the uncompiled model so the StaticCache buffers get
+    # registered (mark_static_address) before torch.compile traces them.
+    init_cache = make_cache()
+    with torch.no_grad():
+        model(single_token, past_key_values=init_cache,
+              cache_position=torch.tensor([0], device=device))
+
+    compiled = torch.compile(model)
+
+    # First compiled call triggers JIT compilation; time it as compile_ms.
+    if device.type == "cuda":
+        torch.cuda.synchronize()
+    t0 = time.perf_counter()
+    with torch.no_grad():
+        compiled(single_token, past_key_values=init_cache,
+                 cache_position=torch.tensor([1], device=device))
+        if device.type == "cuda":
+            torch.cuda.synchronize()
+    compile_ms = (time.perf_counter() - t0) * 1000.0
+
+    def measure_ttft() -> float:
+        """Sum of per-token compiled-forward durations over prompt_tokens steps."""
+        kv = make_cache()
+        # Fresh cache needs eager init via the uncompiled model first.
+        with torch.no_grad():
+            model(single_token, past_key_values=kv,
+                  cache_position=torch.tensor([0], device=device))
+        total_ms = 0.0
+        for pos in range(1, prompt_tokens):
+            if device.type == "cuda":
+                torch.cuda.synchronize()
+            t0 = time.perf_counter()
+            with torch.no_grad():
+                compiled(single_token, past_key_values=kv,
+                         cache_position=torch.tensor([pos], device=device))
+                if device.type == "cuda":
+                    torch.cuda.synchronize()
+            total_ms += (time.perf_counter() - t0) * 1000.0
+        return total_ms
+
+    for _ in range(args.warmups):
+        measure_ttft()
+
+    ttft_samples_ms = [measure_ttft() for _ in range(args.iters)]
+
+    result = {
+        "path": "python_torch_compile",
+        "model": args.model,
+        "device": str(device),
+        "dtype": args.dtype,
+        "prompt_tokens": prompt_tokens,
+        "iters": args.iters,
+        "ttft_ms": statistics.median(ttft_samples_ms),
+        "ttft_ms_mean": sum(ttft_samples_ms) / len(ttft_samples_ms),
+        "ttft_ms_samples": ttft_samples_ms,
+        "compile_ms": compile_ms,
+        "note": "sequential per-token, StaticCache KV cache (torch.compile inductor)",
+    }
+
+    if args.decode_tokens > 0:
+        tpot_samples_ms = measure_tpot(compiled, input_ids, device, args.decode_tokens)
+        tpot_ms = sum(tpot_samples_ms) / len(tpot_samples_ms)
+        result["decode_tokens"] = args.decode_tokens
+        result["tpot_ms"] = tpot_ms
+        result["tpot_ms_samples"] = tpot_samples_ms
+        result["throughput_tps"] = 1000.0 / tpot_ms
+
+    print("BENCH_RESULT " + json.dumps(result))
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/ttft/bench_utils.py
+++ b/benchmarks/ttft/bench_utils.py
@@ -0,0 +1,94 @@
+"""Shared helpers for the Python benchmark scripts."""
+
+import time
+
+import torch
+
+
+class _CfgWithoutKvShared:
+    """Wrapper that hides `num_kv_shared_layers` from a HF config.
+
+    transformers 5.6 has a bug in StaticCache.__init__:
+        if hasattr(config, "num_kv_shared_layers"):
+            layer_types = layer_types[: -config.num_kv_shared_layers]
+    For configs where the attribute is 0 (e.g. Gemma-4), `[:-0]` returns an
+    empty list, leaving StaticCache with zero layer slots, and the LM's
+    first `past_key_values.update(..., layer_idx=0)` raises IndexError.
+
+    This wrapper makes `hasattr(...)` return False so the bad branch never
+    fires. Used via `static_cache_config(config)` below.
+    """
+    __slots__ = ("_inner",)
+
+    def __init__(self, inner):
+        object.__setattr__(self, "_inner", inner)
+
+    def __getattr__(self, name):
+        if name == "num_kv_shared_layers":
+            raise AttributeError(name)
+        return getattr(self._inner, name)
+
+    def get_text_config(self, *args, **kwargs):
+        return _CfgWithoutKvShared(self._inner.get_text_config(*args, **kwargs))
+
+
+def static_cache_config(config):
+    """Return a config suitable for `StaticCache(config=..., ...)`.
+
+    Two normalizations:
+      1. Multimodal wrappers (Gemma4ForConditionalGeneration, ...) nest the
+         actual LM config under `.text_config`. Pass that, not the wrapper,
+         so layer/head counts match the inner LM.
+      2. If the resulting config has `num_kv_shared_layers == 0`, wrap it to
+         hide the attribute (works around the transformers 5.6 slice bug).
+    """
+    cfg = getattr(config, "text_config", config)
+    if getattr(cfg, "num_kv_shared_layers", None) == 0:
+        cfg = _CfgWithoutKvShared(cfg)
+    return cfg
+
+
+def encode_prompt(tokenizer, prompt: str, device):
+    """Tokenize prompt using chat template if available, falling back to raw tokenization."""
+    messages = [{"role": "user", "content": prompt}]
+    try:
+        encoded = tokenizer.apply_chat_template(
+            messages, add_generation_prompt=True, return_tensors="pt"
+        )
+    except (ValueError, AttributeError):
+        encoded = tokenizer(prompt, return_tensors="pt")
+    if hasattr(encoded, "input_ids"):
+        return encoded.input_ids.to(device)
+    if isinstance(encoded, dict):
+        return encoded["input_ids"].to(device)
+    return encoded.to(device)
+
+
+def measure_tpot(model, input_ids, device, decode_tokens: int) -> list[float]:
+    """Prefill once with KV cache, then time each subsequent single-token decode step."""
+    with torch.no_grad():
+        out = model(input_ids, use_cache=True)
+        if device.type == "cuda":
+            torch.cuda.synchronize()
+        past = out.past_key_values
+        next_id = out.logits[:, -1:].argmax(-1)
+
+        out = model(next_id, past_key_values=past, use_cache=True)
+        if device.type == "cuda":
+            torch.cuda.synchronize()
+        past = out.past_key_values
+        next_id = out.logits[:, -1:].argmax(-1)
+
+        step_times_ms = []
+        for _ in range(decode_tokens):
+            if device.type == "cuda":
+                torch.cuda.synchronize()
+            t0 = time.perf_counter()
+            out = model(next_id, past_key_values=past, use_cache=True)
+            if device.type == "cuda":
+                torch.cuda.synchronize()
+            step_times_ms.append((time.perf_counter() - t0) * 1000.0)
+            past = out.past_key_values
+            next_id = out.logits[:, -1:].argmax(-1)
+
+    return step_times_ms
--- a/benchmarks/ttft/benchmarks.toml
+++ b/benchmarks/ttft/benchmarks.toml
@@ -0,0 +1,92 @@
+[ur_test]
+models = ["llama-8b", "qwen3-4b", "gemma3-4b", "gemma4-moe", "qwen3-moe"]
+# 3-point sweep (low/mid/high). The previous list [5, 10, 20, 50, 100, 500]
+# spent ~62 extra minutes on s=5/s=20/s=50 with little additional information.
+search_sweep_iters = [10, 100, 500]
+
+[configs.llama-8b]
+model = "NousResearch/Meta-Llama-3-8B-Instruct"
+rust_package = "llama"
+search_iters = 500
+iters = 10
+warmups = 2
+decode_tokens = 50
+# On-disk weights are bf16-majority. fp32 upcast doubled python_luminal's
+# egglog Search peak past the 525 GB unified pool and triggered SIGKILLs on
+# gemma3-4b (and same risk here). bf16 matches rust's load path.
+dtype = "bfloat16"
+
+[configs.as_fast_as_possible]
+prompt = "The"
+search_iters = 1
+iters = 1
+warmups = 0
+decode_tokens = 5
+
+[configs.qwen3-4b]
+model = "Qwen/Qwen3-4B"
+rust_package = "qwen"
+search_iters = 50
+iters = 10
+warmups = 2
+decode_tokens = 20
+# bf16-majority on-disk; see llama-8b note.
+dtype = "bfloat16"
+
+[configs.gemma3-4b]
+model = "unsloth/gemma-3-4b-it"
+rust_package = "gemma"
+search_iters = 50
+iters = 10
+warmups = 2
+decode_tokens = 20
+# bf16-majority on-disk; see llama-8b note.
+dtype = "bfloat16"
+
+[configs.gemma4-moe]
+model = "google/gemma-4-26B-A4B"
+rust_package = "gemma4_moe"
+search_iters = 50
+iters = 10
+warmups = 2
+decode_tokens = 20
+# 26B params at fp32 = 104 GB → OOM on a 94 GB GPU. Use bf16 (matches the
+# on-disk safetensors dtype) so the python paths can actually load.
+dtype = "bfloat16"
+
+[configs.qwen3-moe]
+model = "Qwen/Qwen3-30B-A3B"
+rust_package = "qwen3_moe"
+search_iters = 50
+iters = 10
+warmups = 2
+decode_tokens = 20
+# 30B params at fp32 = 120 GB → OOM. See gemma4-moe note.
+dtype = "bfloat16"
+
+[configs.llama-8b-const]
+model = "NousResearch/Meta-Llama-3-8B-Instruct"
+rust_package = "llama"
+prompt = "We the People of the United States, in Order to form a more perfect Union, establish Justice, insure domestic Tranquility, provide for the common defence, promote the general Welfare, and secure the Blessings of Liberty to ourselves and our Posterity, do ordain and establish this Constitution for the United States of America."
+search_iters = 500
+iters = 10
+warmups = 2
+decode_tokens = 20
+
+[configs.qwen3-4b-const]
+model = "Qwen/Qwen3-4B"
+rust_package = "qwen"
+prompt = "We the People of the United States, in Order to form a more perfect Union, establish Justice, insure domestic Tranquility, provide for the common defence, promote the general Welfare, and secure the Blessings of Liberty to ourselves and our Posterity, do ordain and establish this Constitution for the United States of America."
+search_iters = 50
+iters = 10
+warmups = 2
+decode_tokens = 20
+
+[configs.gemma3-4b-const]
+model = "unsloth/gemma-3-4b-it"
+rust_package = "gemma"
+prompt = "We the People of the United States, in Order to form a more perfect Union, establish Justice, insure domestic Tranquility, provide for the common defence, promote the general Welfare, and secure the Blessings of Liberty to ourselves and our Posterity, do ordain and establish this Constitution for the United States of America."
+search_iters = 50
+iters = 10
+warmups = 2
+decode_tokens = 20
--- a/benchmarks/ttft/dashboard.html
+++ b/benchmarks/ttft/dashboard.html
@@ -0,0 +1,610 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>Luminal · Benchmark Dashboard</title>
+<link rel="preconnect" href="https://fonts.googleapis.com">
+<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+<link href="https://fonts.googleapis.com/css2?family=Geist:wght@300;400;500;600&family=Geist+Mono:wght@300;400;500&display=swap" rel="stylesheet">
+<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
+<style>
+*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
+html { -webkit-font-smoothing: antialiased; scroll-behavior: smooth; }
+
+body {
+  font-family: 'Geist', system-ui, sans-serif;
+  background: #030712;
+  color: #d7d8d9;
+  min-height: 100vh;
+  line-height: 1.5;
+}
+
+/* ── NAV ── */
+nav {
+  position: sticky;
+  top: 0;
+  z-index: 50;
+  height: 56px;
+  background: rgba(8, 15, 17, 0.92);
+  backdrop-filter: blur(8px);
+  -webkit-backdrop-filter: blur(8px);
+  border-bottom: 1px solid #2d3335;
+  display: flex;
+  align-items: center;
+  padding: 0 24px;
+  gap: 0;
+}
+.nav-brand {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  font-family: 'Geist Mono', monospace;
+  font-size: 14px;
+  font-weight: 500;
+  letter-spacing: 0.05em;
+  color: #2faa6e;
+  text-decoration: none;
+}
+.nav-dot {
+  width: 6px;
+  height: 6px;
+  background: #2faa6e;
+  border-radius: 50%;
+  flex-shrink: 0;
+  animation: pulse-glow 2s ease-in-out infinite;
+}
+.nav-sep {
+  color: #2d3335;
+  margin: 0 14px;
+  font-size: 18px;
+  font-weight: 300;
+}
+.nav-page {
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: #7e8385;
+}
+
+@keyframes pulse-glow {
+  0%, 100% { opacity: 1; }
+  50%       { opacity: 0.35; }
+}
+
+/* ── MAIN ── */
+main {
+  max-width: 1200px;
+  margin: 0 auto;
+  padding: 40px 24px 80px;
+}
+
+/* ── PAGE HEADER ── */
+.page-header {
+  margin-bottom: 40px;
+  padding-bottom: 32px;
+  border-bottom: 1px solid #1c2225;
+}
+.page-eyebrow {
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: #2faa6e;
+  margin-bottom: 10px;
+}
+.page-title {
+  font-size: 30px;
+  font-weight: 500;
+  letter-spacing: -0.025em;
+  color: #d7d8d9;
+  margin-bottom: 10px;
+}
+.page-meta {
+  font-size: 14px;
+  color: #7e8385;
+  display: flex;
+  align-items: center;
+  gap: 0;
+  flex-wrap: wrap;
+}
+.meta-sep {
+  font-family: 'Geist Mono', monospace;
+  color: #2d3335;
+  margin: 0 10px;
+}
+.meta-val {
+  font-family: 'Geist Mono', monospace;
+  font-size: 13px;
+  color: #5b5f61;
+}
+
+/* ── LEGEND STRIP ── */
+.legend-strip {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 6px;
+  margin-bottom: 32px;
+}
+.legend-pill {
+  display: flex;
+  align-items: center;
+  gap: 6px;
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  color: #a1a4a5;
+  background: #141b1d;
+  border: 1px solid #2d3335;
+  border-radius: 2px;
+  padding: 4px 10px;
+}
+.legend-swatch {
+  width: 8px;
+  height: 8px;
+  border-radius: 50%;
+  flex-shrink: 0;
+}
+
+/* ── SECTIONS ── */
+section { margin-bottom: 48px; }
+.section-header {
+  display: flex;
+  align-items: baseline;
+  gap: 10px;
+  margin-bottom: 16px;
+  padding-bottom: 12px;
+  border-bottom: 1px solid #1c2225;
+  flex-wrap: wrap;
+}
+.section-eyebrow {
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: #404647;
+}
+.section-title {
+  font-size: 18px;
+  font-weight: 500;
+  color: #d7d8d9;
+  letter-spacing: -0.01em;
+}
+.section-title .unit {
+  color: #7e8385;
+  font-weight: 400;
+}
+.section-tag {
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+  color: #2faa6e;
+  background: #162322;
+  border: 1px solid #1c372e;
+  padding: 2px 8px;
+  border-radius: 2px;
+  margin-left: auto;
+}
+
+/* ── CHART GRID ── */
+.chart-grid {
+  display: grid;
+  gap: 10px;
+}
+.chart-card {
+  background: #141b1d;
+  border: 1px solid #2d3335;
+  border-radius: 2px;
+  overflow: hidden;
+  transition: border-color 150ms;
+  min-width: 0;
+}
+.chart-card:hover { border-color: #404647; }
+.chart-card-header {
+  padding: 10px 14px 0;
+  display: flex;
+  align-items: center;
+}
+.model-tag {
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.06em;
+  text-transform: uppercase;
+  color: #7e8385;
+}
+
+/* ── FOOTER ── */
+footer {
+  max-width: 1200px;
+  margin: 0 auto;
+  padding: 20px 24px;
+  border-top: 1px solid #1c2225;
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  color: #404647;
+  display: flex;
+  justify-content: space-between;
+  flex-wrap: wrap;
+  gap: 8px;
+}
+
+.section-divider {
+  border: none;
+  border-top: 1px solid #1c2225;
+  margin: 8px 0 40px;
+}
+.sweep-hint {
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  color: #404647;
+  margin-bottom: 12px;
+}
+
+@media (max-width: 768px) {
+  .chart-grid { grid-template-columns: 1fr !important; }
+  .page-title { font-size: 22px; }
+}
+</style>
+</head>
+<body>
+
+<nav>
+  <a class="nav-brand" href="https://luminal.com">
+    <span class="nav-dot"></span>luminal
+  </a>
+  <span class="nav-sep">/</span>
+  <span class="nav-page">benchmarks</span>
+</nav>
+
+<main>
+
+<header class="page-header">
+  <p class="page-eyebrow">performance · time-series</p>
+  <h1 class="page-title">Benchmark Dashboard</h1>
+  <div class="page-meta">
+    <span>Last updated</span>
+    <span class="meta-sep">·</span>
+    <span class="meta-val">May 01, 2026 · 18:56</span>
+    <span class="meta-sep">·</span>
+    <span class="meta-val">1 run in history</span>
+  </div>
+</header>
+
+<div class="legend-strip">
+  <div class="legend-pill"><span class="legend-swatch" style="background:#5b5f61"></span>HF Baseline</div><div class="legend-pill"><span class="legend-swatch" style="background:#3b82f6"></span>torch.compile</div><div class="legend-pill"><span class="legend-swatch" style="background:#a855f7"></span>luminal backend</div><div class="legend-pill"><span class="legend-swatch" style="background:#e8855a"></span>Rust (luminal)</div>
+</div>
+
+
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">metric</span>
+    <h2 class="section-title">TTFT <span class="unit">over time</span></h2>
+    <span class="section-tag">Time to first token (ms)</span>
+  </div>
+  <div class="chart-grid" style="grid-template-columns: repeat(4, 1fr)">
+<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">llama-8b</span>
+  </div>
+  <div id="c_ttft_ms_llama_8b"></div>
+  <script>
+    Plotly.newPlot("c_ttft_ms_llama_8b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [705.9654394979589], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [307.66548847896047], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [461.48114453535527], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [1026.86], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 48, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-4b</span>
+  </div>
+  <div id="c_ttft_ms_qwen3_4b"></div>
+  <script>
+    Plotly.newPlot("c_ttft_ms_qwen3_4b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [869.2860195587855], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [298.27259748708457], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [485.3892414830625], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [398.58], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma3-4b</span>
+  </div>
+  <div id="c_ttft_ms_gemma3_4b"></div>
+  <script>
+    Plotly.newPlot("c_ttft_ms_gemma3_4b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [951.1196144158021], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [300.9451600664761], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [404.43], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma4-moe</span>
+  </div>
+  <div id="c_ttft_ms_gemma4_moe"></div>
+  <script>
+    Plotly.newPlot("c_ttft_ms_gemma4_moe", [{"x": ["2026-05-01T18-56-26-996695"], "y": [837.3980740143452], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [245.510076492792], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-moe</span>
+  </div>
+  <div id="c_ttft_ms_qwen3_moe"></div>
+  <script>
+    Plotly.newPlot("c_ttft_ms_qwen3_moe", [{"x": ["2026-05-01T18-56-26-996695"], "y": [1565.540504961973], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [460.077923577046], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [21002.791983017232], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [662.07], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div>
+  </div>
+</section>
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">metric</span>
+    <h2 class="section-title">TPOT <span class="unit">over time</span></h2>
+    <span class="section-tag">Time per output token (ms)</span>
+  </div>
+  <div class="chart-grid" style="grid-template-columns: repeat(4, 1fr)">
+<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">llama-8b</span>
+  </div>
+  <div id="c_tpot_ms_llama_8b"></div>
+  <script>
+    Plotly.newPlot("c_tpot_ms_llama_8b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [34.15271903970279], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [171.7862353892997], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [23.078908618772402], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [51.64], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 48, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-4b</span>
+  </div>
+  <div id="c_tpot_ms_qwen3_4b"></div>
+  <script>
+    Plotly.newPlot("c_tpot_ms_qwen3_4b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [47.71483448566869], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [468.56868775503244], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [26.90318431414198], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [40.62], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma3-4b</span>
+  </div>
+  <div id="c_tpot_ms_gemma3_4b"></div>
+  <script>
+    Plotly.newPlot("c_tpot_ms_gemma3_4b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [52.498737201676704], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [2197.426627812092], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [38.99], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma4-moe</span>
+  </div>
+  <div id="c_tpot_ms_gemma4_moe"></div>
+  <script>
+    Plotly.newPlot("c_tpot_ms_gemma4_moe", [{"x": ["2026-05-01T18-56-26-996695"], "y": [83.64427039632574], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [654.9649795080768], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-moe</span>
+  </div>
+  <div id="c_tpot_ms_qwen3_moe"></div>
+  <script>
+    Plotly.newPlot("c_tpot_ms_qwen3_moe", [{"x": ["2026-05-01T18-56-26-996695"], "y": [84.527321747737], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "HF Baseline", "line": {"color": "#5b5f61", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>HF Baseline</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [753.0061075551203], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [1166.8824461026816], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [60.08], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} ms<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " ms", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div>
+  </div>
+</section>
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">metric</span>
+    <h2 class="section-title">Time to Search <span class="unit">over time</span></h2>
+    <span class="section-tag">Search time (sec)</span>
+  </div>
+  <div class="chart-grid" style="grid-template-columns: repeat(4, 1fr)">
+<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">llama-8b</span>
+  </div>
+  <div id="c_compile_ms_llama_8b"></div>
+  <script>
+    Plotly.newPlot("c_compile_ms_llama_8b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [18.760145067994017], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [95.96263545705006], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [84.45343], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": true, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 48, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " sec", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-4b</span>
+  </div>
+  <div id="c_compile_ms_qwen3_4b"></div>
+  <script>
+    Plotly.newPlot("c_compile_ms_qwen3_4b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [4.680963660997804], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [45.345814052037895], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [19.92977], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " sec", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma3-4b</span>
+  </div>
+  <div id="c_compile_ms_gemma3_4b"></div>
+  <script>
+    Plotly.newPlot("c_compile_ms_gemma3_4b", [{"x": ["2026-05-01T18-56-26-996695"], "y": [26.649526304972824], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [156.84164], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " sec", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma4-moe</span>
+  </div>
+  <div id="c_compile_ms_gemma4_moe"></div>
+  <script>
+    Plotly.newPlot("c_compile_ms_gemma4_moe", [{"x": ["2026-05-01T18-56-26-996695"], "y": [38.81582092499593], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " sec", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-moe</span>
+  </div>
+  <div id="c_compile_ms_qwen3_moe"></div>
+  <script>
+    Plotly.newPlot("c_compile_ms_qwen3_moe", [{"x": ["2026-05-01T18-56-26-996695"], "y": [8.341281775035895], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "torch.compile", "line": {"color": "#3b82f6", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>torch.compile</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [111.70731823903043], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "luminal backend", "line": {"color": "#a855f7", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>luminal backend</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}, {"x": ["2026-05-01T18-56-26-996695"], "y": [80.83241000000001], "customdata": [["b2bd91f5", "2026-05-01T18:56:26.996695"]], "type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "line": {"color": "#e8855a", "width": 2}, "marker": {"size": 7, "symbol": "circle"}, "connectgaps": false, "showlegend": false, "hovertemplate": "<b>Rust (luminal)</b><br>%{customdata[1]}<br>%{y:.1f} sec<br><span style='color:#7e8385'>commit %{customdata[0]}</span><extra></extra>"}], {"plot_bgcolor": "#0d1416", "paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"}, "margin": {"t": 16, "b": 16, "l": 52, "r": 12}, "height": 280, "xaxis": {"type": "category", "categoryorder": "array", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "tickangle": -30, "automargin": true, "zeroline": false, "categoryarray": ["2026-05-01T18-56-26-996695"], "tickvals": ["2026-05-01T18-56-26-996695"], "ticktext": ["May 01 \u00b7 18:56"]}, "yaxis": {"rangemode": "tozero", "color": "#5b5f61", "gridcolor": "#1c2225", "linecolor": "#2d3335", "tickfont": {"size": 11, "family": "Geist Mono, monospace"}, "ticksuffix": " sec", "zeroline": false}, "legend": {"orientation": "h", "y": -0.28, "x": 0, "font": {"size": 11, "color": "#a1a4a5"}, "bgcolor": "rgba(0,0,0,0)", "visible": false}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}},
+      {responsive: true, displayModeBar: false});
+  </script>
+</div>
+  </div>
+</section>
+<hr class='section-divider'>
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">sweep · 3d</span>
+    <h2 class="section-title">TTFT <span class="unit">vs search budget · over time</span></h2>
+    <span class="section-tag">1 run</span>
+  </div>
+  <p class="sweep-hint">Drag to rotate · scroll to zoom · each curve = one run</p>
+  <div class="chart-grid" style="grid-template-columns: repeat(4, 1fr)">
+<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">llama-8b</span>
+  </div>
+  <div id="sw_ttft_ms_llama_8b"></div>
+  <script>
+    Plotly.newPlot("sw_ttft_ms_llama_8b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [470.7036415056791, 460.72837291285396, 472.43661794345826], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [751.03, 1038.34, 453.16], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-4b</span>
+  </div>
+  <div id="sw_ttft_ms_qwen3_4b"></div>
+  <script>
+    Plotly.newPlot("sw_ttft_ms_qwen3_4b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [465.02652901108377, 465.9317950136028, 495.75577257201076], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [398.44, 390.08, 559.29], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma3-4b</span>
+  </div>
+  <div id="sw_ttft_ms_gemma3_4b"></div>
+  <script>
+    Plotly.newPlot("sw_ttft_ms_gemma3_4b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [388.19, 436.49, 386.13], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-moe</span>
+  </div>
+  <div id="sw_ttft_ms_qwen3_moe"></div>
+  <script>
+    Plotly.newPlot("sw_ttft_ms_qwen3_moe", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [21002.663500519702, 21018.686580006033, 21034.366824431345], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [656.7, 540.37, 542.34], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div>
+  </div>
+</section>
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">sweep · 3d</span>
+    <h2 class="section-title">TPOT <span class="unit">vs search budget · over time</span></h2>
+    <span class="section-tag">1 run</span>
+  </div>
+  <p class="sweep-hint">Drag to rotate · scroll to zoom · each curve = one run</p>
+  <div class="chart-grid" style="grid-template-columns: repeat(4, 1fr)">
+<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">llama-8b</span>
+  </div>
+  <div id="sw_tpot_ms_llama_8b"></div>
+  <script>
+    Plotly.newPlot("sw_tpot_ms_llama_8b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [23.540849717101082, 23.101884137140587, 23.610779400914907], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [38.2, 51.92, 24.09], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-4b</span>
+  </div>
+  <div id="sw_tpot_ms_qwen3_4b"></div>
+  <script>
+    Plotly.newPlot("sw_tpot_ms_qwen3_4b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [25.875402649398893, 25.884080055402592, 27.492373346467502], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [40.64, 39.98, 55.37], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma3-4b</span>
+  </div>
+  <div id="sw_tpot_ms_gemma3_4b"></div>
+  <script>
+    Plotly.newPlot("sw_tpot_ms_gemma3_4b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [37.47, 41.95, 37.25], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-moe</span>
+  </div>
+  <div id="sw_tpot_ms_qwen3_moe"></div>
+  <script>
+    Plotly.newPlot("sw_tpot_ms_qwen3_moe", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [1166.6714247548953, 1167.2746865515364, 1168.7990181031637], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [59.6, 48.79, 48.88], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} ms<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "ms", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " ms", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div>
+  </div>
+</section>
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">sweep · 3d</span>
+    <h2 class="section-title">Time to Search <span class="unit">vs search budget · over time</span></h2>
+    <span class="section-tag">1 run</span>
+  </div>
+  <p class="sweep-hint">Drag to rotate · scroll to zoom · each curve = one run</p>
+  <div class="chart-grid" style="grid-template-columns: repeat(4, 1fr)">
+<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">llama-8b</span>
+  </div>
+  <div id="sw_compile_ms_llama_8b"></div>
+  <script>
+    Plotly.newPlot("sw_compile_ms_llama_8b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [28.428826077957638, 43.57440591201885, 95.52432684396626], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} sec<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [15.14307, 30.12727, 84.87889], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} sec<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "sec", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " sec", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-4b</span>
+  </div>
+  <div id="sw_compile_ms_qwen3_4b"></div>
+  <script>
+    Plotly.newPlot("sw_compile_ms_qwen3_4b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [37.92102829599753, 54.08867314597592, 118.29659596900456], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} sec<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [12.448030000000001, 27.06796, 81.89342], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} sec<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "sec", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " sec", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">gemma3-4b</span>
+  </div>
+  <div id="sw_compile_ms_gemma3_4b"></div>
+  <script>
+    Plotly.newPlot("sw_compile_ms_gemma3_4b", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [102.18644, 186.34269, 498.48983000000004], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} sec<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "sec", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " sec", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div><div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">qwen3-moe</span>
+  </div>
+  <div id="sw_compile_ms_qwen3_moe"></div>
+  <script>
+    Plotly.newPlot("sw_compile_ms_qwen3_moe", [{"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [93.47603664599592, 132.266081985028, 298.05094401398674], "name": "luminal backend", "legendgroup": "python_luminal", "showlegend": true, "line": {"color": "#a855f7", "width": 5}, "marker": {"color": "#a855f7", "size": 4}, "hovertemplate": "<b>luminal backend</b><br>s=%{x} iters<br>%{z:.1f} sec<br>May 01 \u00b7 b2bd91f5<extra></extra>"}, {"type": "scatter3d", "mode": "lines+markers", "x": [10, 100, 500], "y": ["May 01", "May 01", "May 01"], "z": [25.48138, 47.5342, 134.79345], "name": "Rust (luminal)", "legendgroup": "rust", "showlegend": true, "line": {"color": "#e8855a", "width": 5}, "marker": {"color": "#e8855a", "size": 4}, "hovertemplate": "<b>Rust (luminal)</b><br>s=%{x} iters<br>%{z:.1f} sec<br>May 01 \u00b7 b2bd91f5<extra></extra>"}], {"paper_bgcolor": "#141b1d", "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11}, "height": 420, "margin": {"t": 20, "b": 0, "l": 0, "r": 0}, "legend": {"orientation": "h", "y": -0.05, "x": 0, "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"}, "bgcolor": "rgba(0,0,0,0)"}, "hoverlabel": {"bgcolor": "#1c2225", "bordercolor": "#2d3335", "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"}}, "scene": {"bgcolor": "#0d1416", "xaxis": {"title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}}, "type": "log", "tickvals": [5, 10, 20, 50, 100, 500], "ticktext": ["5", "10", "20", "50", "100", "500"], "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335", "zerolinecolor": "#2d3335"}, "yaxis": {"title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}}, "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "zaxis": {"title": {"text": "sec", "font": {"size": 10, "color": "#7e8385"}}, "rangemode": "tozero", "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"}, "ticksuffix": " sec", "gridcolor": "#1c2225", "linecolor": "#2d3335"}, "camera": {"eye": {"x": 1.6, "y": -1.6, "z": 0.9}}}},
+      {responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]});
+  </script>
+</div>
+  </div>
+</section>
+
+</main>
+
+<footer>
+  <span>luminal · benchmark dashboard</span>
+  <span>generated May 01, 2026 · 18:56</span>
+</footer>
+
+</body>
+</html>
--- a/benchmarks/ttft/db.py
+++ b/benchmarks/ttft/db.py
@@ -0,0 +1,242 @@
+"""SQLite persistence for TTFT/TPOT benchmark runs.
+
+Two tables:
+  runs    — one row per orchestrator invocation
+  results — many rows per run, one per (path, config) combination
+
+`results` carries every field that today's BENCH_RESULT JSON record carries.
+Per-iteration sample arrays (`ttft_ms_samples`, `tpot_ms_samples`) are kept as
+JSON TEXT — they're archival, no consumer aggregates over them.
+
+The default DB path is benchmarks/ttft/bench.db (gitignored). Schema is
+created lazily on first connect.
+"""
+
+from __future__ import annotations
+
+import json
+import sqlite3
+from pathlib import Path
+from typing import Any, Iterable
+
+BENCH_DIR = Path(__file__).resolve().parent
+DEFAULT_DB_PATH = BENCH_DIR / "bench.db"
+
+
+_SCHEMA = """
+CREATE TABLE IF NOT EXISTS runs (
+  run_id        TEXT PRIMARY KEY,
+  timestamp     TEXT NOT NULL,
+  git_commit    TEXT,
+  git_branch    TEXT,
+  gpu_name      TEXT,
+  gpu_driver    TEXT,
+  gpu_vram_mb   INTEGER,
+  cuda_version  TEXT,
+  mode          TEXT NOT NULL  -- 'single' | 'all-configs' | 'search-sweep' | 'ur-test' | 'ur-test-fast'
+);
+
+CREATE TABLE IF NOT EXISTS results (
+  id              INTEGER PRIMARY KEY AUTOINCREMENT,
+  run_id          TEXT NOT NULL REFERENCES runs(run_id) ON DELETE CASCADE,
+  path            TEXT NOT NULL,
+  model           TEXT NOT NULL,
+  model_key       TEXT,
+  config          TEXT NOT NULL,
+  device          TEXT,
+  dtype           TEXT,
+  prompt_tokens   INTEGER,
+  iters           INTEGER,
+  decode_tokens   INTEGER,
+  search_iters    INTEGER,
+  ttft_ms         REAL,
+  ttft_ms_mean    REAL,
+  tpot_ms         REAL,
+  throughput_tps  REAL,
+  compile_ms      REAL,
+  note            TEXT,
+  error           TEXT,
+  ttft_ms_samples TEXT,
+  tpot_ms_samples TEXT,
+  created_at      TEXT NOT NULL DEFAULT (datetime('now'))
+);
+
+CREATE INDEX IF NOT EXISTS idx_results_run    ON results(run_id);
+CREATE INDEX IF NOT EXISTS idx_results_path   ON results(path);
+CREATE INDEX IF NOT EXISTS idx_results_config ON results(config);
+CREATE INDEX IF NOT EXISTS idx_results_modelk ON results(model_key);
+"""
+
+
+# Columns that map 1:1 from a BENCH_RESULT record dict into `results`.
+_SCALAR_RESULT_COLS = (
+    "path", "model", "model_key", "config",
+    "device", "dtype",
+    "prompt_tokens", "iters", "decode_tokens", "search_iters",
+    "ttft_ms", "ttft_ms_mean", "tpot_ms", "throughput_tps", "compile_ms",
+    "note", "error",
+)
+_SAMPLE_COLS = ("ttft_ms_samples", "tpot_ms_samples")
+_ALL_RESULT_COLS = ("run_id",) + _SCALAR_RESULT_COLS + _SAMPLE_COLS
+
+
+def connect(path: str | Path = DEFAULT_DB_PATH) -> sqlite3.Connection:
+    """Open (or create) the bench DB and ensure the schema exists."""
+    p = Path(path)
+    p.parent.mkdir(parents=True, exist_ok=True)
+    conn = sqlite3.connect(p)
+    conn.row_factory = sqlite3.Row
+    conn.execute("PRAGMA foreign_keys = ON")
+    conn.executescript(_SCHEMA)
+    return conn
+
+
+def insert_run(
+    conn: sqlite3.Connection,
+    *,
+    run_id: str,
+    timestamp: str,
+    mode: str,
+    git_commit: str | None = None,
+    git_branch: str | None = None,
+    gpu_name: str | None = None,
+    gpu_driver: str | None = None,
+    gpu_vram_mb: int | None = None,
+    cuda_version: str | None = None,
+    if_exists: str = "ignore",
+) -> str:
+    """Insert a run row.  if_exists='ignore' (default) leaves an existing
+    row untouched; 'replace' overwrites."""
+    verb = {"ignore": "INSERT OR IGNORE", "replace": "INSERT OR REPLACE"}[if_exists]
+    conn.execute(
+        f"""{verb} INTO runs
+            (run_id, timestamp, git_commit, git_branch,
+             gpu_name, gpu_driver, gpu_vram_mb, cuda_version, mode)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+        (run_id, timestamp, git_commit, git_branch,
+         gpu_name, gpu_driver, gpu_vram_mb, cuda_version, mode),
+    )
+    return run_id
+
+
+def insert_result(conn: sqlite3.Connection, run_id: str, record: dict[str, Any]) -> int:
+    """Insert one BENCH_RESULT-shaped record under the given run_id."""
+    values = [run_id]
+    for col in _SCALAR_RESULT_COLS:
+        values.append(record.get(col))
+    for col in _SAMPLE_COLS:
+        v = record.get(col)
+        values.append(json.dumps(v) if v is not None else None)
+    placeholders = ", ".join(["?"] * len(_ALL_RESULT_COLS))
+    cols = ", ".join(_ALL_RESULT_COLS)
+    cur = conn.execute(
+        f"INSERT INTO results ({cols}) VALUES ({placeholders})",
+        values,
+    )
+    return cur.lastrowid
+
+
+def insert_results(conn: sqlite3.Connection, run_id: str, records: Iterable[dict[str, Any]]) -> int:
+    """Bulk-insert; returns count."""
+    n = 0
+    for r in records:
+        insert_result(conn, run_id, r)
+        n += 1
+    return n
+
+
+def latest_run_id(conn: sqlite3.Connection) -> str | None:
+    row = conn.execute(
+        "SELECT run_id FROM runs ORDER BY timestamp DESC, run_id DESC LIMIT 1"
+    ).fetchone()
+    return row["run_id"] if row else None
+
+
+def load_run(conn: sqlite3.Connection, run_id: str) -> dict[str, Any] | None:
+    row = conn.execute("SELECT * FROM runs WHERE run_id = ?", (run_id,)).fetchone()
+    return dict(row) if row else None
+
+
+def load_runs(conn: sqlite3.Connection) -> list[dict[str, Any]]:
+    """All runs, oldest → newest."""
+    rows = conn.execute(
+        "SELECT * FROM runs ORDER BY timestamp ASC, run_id ASC"
+    ).fetchall()
+    return [dict(r) for r in rows]
+
+
+def _row_to_record(row: sqlite3.Row) -> dict[str, Any]:
+    """Convert a results row into a BENCH_RESULT-shaped dict, stripping NULLs
+    so consumers see the same shape they did with JSON."""
+    out: dict[str, Any] = {}
+    for col in _SCALAR_RESULT_COLS:
+        v = row[col]
+        if v is not None:
+            out[col] = v
+    for col in _SAMPLE_COLS:
+        v = row[col]
+        if v is not None:
+            out[col] = json.loads(v)
+    return out
+
+
+def load_results(conn: sqlite3.Connection, run_id: str) -> list[dict[str, Any]]:
+    """All results for one run, in insertion order."""
+    rows = conn.execute(
+        "SELECT * FROM results WHERE run_id = ? ORDER BY id ASC", (run_id,)
+    ).fetchall()
+    return [_row_to_record(r) for r in rows]
+
+
+def load_history(conn: sqlite3.Connection) -> list[dict[str, Any]]:
+    """Mirror the legacy gen_dashboard.load_history() shape:
+    [{"meta": {...}, "results": [...], "sweep": [...]}], sorted oldest→newest.
+    Splits results vs sweep by config-startswith('s=')."""
+    out = []
+    for run in load_runs(conn):
+        run_id = run["run_id"]
+        meta = {
+            "run_id":      run_id,
+            "timestamp":   run["timestamp"],
+            "git_commit":  run["git_commit"] or "?",
+            "git_branch":  run["git_branch"] or "?",
+        }
+        if run["gpu_name"] is not None:
+            meta["gpu_name"] = run["gpu_name"]
+        if run["gpu_driver"] is not None:
+            meta["gpu_driver"] = run["gpu_driver"]
+        if run["gpu_vram_mb"] is not None:
+            meta["gpu_vram_mb"] = run["gpu_vram_mb"]
+        if run["cuda_version"] is not None:
+            meta["cuda_version"] = run["cuda_version"]
+
+        records = load_results(conn, run_id)
+        comparison, sweep = [], []
+        for r in records:
+            (sweep if r.get("config", "").startswith("s=") else comparison).append(r)
+        out.append({"meta": meta, "results": comparison, "sweep": sweep})
+    return out
+
+
+# ── self-test ────────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    # In-memory smoke test: round-trip one record.
+    conn = sqlite3.connect(":memory:")
+    conn.row_factory = sqlite3.Row
+    conn.executescript(_SCHEMA)
+    insert_run(conn, run_id="test", timestamp="2026-04-27T00:00:00", mode="single")
+    insert_result(conn, "test", {
+        "path": "rust",
+        "model": "test-model",
+        "config": "default",
+        "ttft_ms": 12.34,
+        "ttft_ms_samples": [12.0, 12.5, 12.3],
+        "search_iters": 500,
+    })
+    [row] = load_results(conn, "test")
+    assert row["path"] == "rust", row
+    assert row["ttft_ms"] == 12.34, row
+    assert row["ttft_ms_samples"] == [12.0, 12.5, 12.3], row
+    assert latest_run_id(conn) == "test"
+    print("db.py smoke test ok")
--- a/benchmarks/ttft/gen_dashboard.py
+++ b/benchmarks/ttft/gen_dashboard.py
@@ -0,0 +1,832 @@
+"""Time-series benchmark dashboard generator.
+
+Reads every run from the SQLite DB (benchmarks/ttft/bench.db) and produces a
+single standalone HTML file with Plotly.js charts styled to match luminal.com.
+
+Layout:
+  TTFT over time  →  one chart per model, lines = execution paths
+  TPOT over time  →  same
+
+Usage:
+  python3 benchmarks/ttft/gen_dashboard.py [--db PATH] [--out FILE]
+"""
+
+import argparse
+import json
+from datetime import datetime
+from pathlib import Path
+
+import db
+
+BENCH_DIR = Path(__file__).resolve().parent
+
+# Path colours – kept distinct against the dark green Luminal accent
+PATH_COLORS = {
+    "python_baseline":      "#5b5f61",  # muted slate
+    "python_torch_compile": "#3b82f6",  # blue (luminal accent palette)
+    "python_luminal":       "#a855f7",  # purple (luminal accent palette)
+    "rust":                 "#e8855a",  # warm orange – Rust brand feel
+}
+PATH_LABELS = {
+    "python_baseline":      "HF Baseline",
+    "python_torch_compile": "torch.compile",
+    "python_luminal":       "luminal backend",
+    "rust":                 "Rust (luminal)",
+}
+PATH_ORDER = ["python_baseline", "python_torch_compile", "python_luminal", "rust"]
+
+# (key, short label, y-axis label, scale, axis ticksuffix)
+# scale is applied to raw value before plotting (e.g. ms → sec via 0.001).
+METRICS = [
+    ("ttft_ms",    "TTFT",            "Time to first token (ms)",   1.0,   " ms"),
+    ("tpot_ms",    "TPOT",            "Time per output token (ms)", 1.0,   " ms"),
+    ("compile_ms", "Time to Search",  "Search time (sec)",          0.001, " sec"),
+]
+
+
+# ── data loading ─────────────────────────────────────────────────────────────
+
+def load_history(db_path: Path) -> list[dict]:
+    """Return [{"meta", "results", "sweep"}, …] from the bench DB,
+    oldest→newest. Same shape the legacy JSON loader returned."""
+    if not Path(db_path).exists():
+        return []
+    conn = db.connect(db_path)
+    return db.load_history(conn)
+
+
+def build_series(runs: list[dict]) -> tuple[dict, list[str], list[str]]:
+    """Returns (data, run_ids, run_labels).
+
+    - data[model][path][metric] = [(run_id, value, commit, ts), ...]
+      `run_id` is the categorical x value; `ts` is kept for tooltip formatting.
+    - run_ids: chronological list of every run that appears in the comparison data.
+    - run_labels: parallel to run_ids; "MMM DD · HH:MM" for nice axis ticks.
+
+    The categorical x-axis (one column per run_id) replaces the previous
+    `type: date` axis. With multiple runs on the same day, the date axis
+    silently stacked them on one column; the category axis spaces them
+    evenly so each run is visually distinct.
+    """
+    data: dict = {}
+    seen_run_ids: list[str] = []
+    seen_ts: dict[str, str] = {}
+
+    for run in runs:
+        run_id = run["meta"]["run_id"]
+        ts = run["meta"]["timestamp"]
+        commit = run["meta"].get("git_commit", "?")
+        had_data = False
+        for r in run["results"]:
+            if r.get("error") or r.get("ttft_ms") is None:
+                continue
+            model = r.get("config", r.get("model", "unknown"))
+            path = r.get("path", "unknown")
+            data.setdefault(model, {}).setdefault(path, {})
+            for metric, _, _, scale, _ in METRICS:
+                val = r.get(metric)
+                if val is not None:
+                    data[model][path].setdefault(metric, []).append(
+                        (run_id, val * scale, commit, ts)
+                    )
+                    had_data = True
+        if had_data and run_id not in seen_ts:
+            seen_run_ids.append(run_id)
+            seen_ts[run_id] = ts
+
+    run_ids = sorted(seen_run_ids, key=lambda rid: seen_ts.get(rid, rid))
+    run_labels = []
+    for rid in run_ids:
+        ts = seen_ts.get(rid, rid)
+        try:
+            run_labels.append(datetime.fromisoformat(ts).strftime("%b %d · %H:%M"))
+        except ValueError:
+            run_labels.append(rid[:16].replace("T", " "))
+    return data, run_ids, run_labels
+
+
+def build_sweep_series(runs: list[dict]) -> tuple[dict, list[str]]:
+    """Collect sweep records from ALL runs for 3D charting.
+
+    Returns:
+      data[model_key][path][metric][run_id] = {
+          "label":  str,              # short date label for Y axis
+          "commit": str,
+          "points": [(iters, ms), …]  # sorted by iters
+      }
+      run_ids: list[str] in chronological order (oldest → newest)
+    """
+    data: dict = {}
+    run_ids: list[str] = []
+
+    for run in runs:
+        if not run.get("sweep"):
+            continue
+        run_id = run["meta"]["run_id"]
+        commit = run["meta"].get("git_commit", "?")
+        try:
+            label = datetime.fromisoformat(run["meta"]["timestamp"]).strftime("%b %d")
+        except ValueError:
+            label = run_id[:10]
+        if run_id not in run_ids:
+            run_ids.append(run_id)
+
+        for r in run["sweep"]:
+            if r.get("error"):
+                continue
+            n = r.get("search_iters")
+            if n is None:
+                cfg = r.get("config", "")
+                if cfg.startswith("s="):
+                    try:
+                        n = int(cfg[2:])
+                    except ValueError:
+                        continue
+            if n is None:
+                continue
+            model_key = r.get("model_key", "unknown")
+            path = r.get("path", "unknown")
+            for metric, _, _, scale, _ in METRICS:
+                val = r.get(metric)
+                if val is None:
+                    continue
+                (data
+                    .setdefault(model_key, {})
+                    .setdefault(path, {})
+                    .setdefault(metric, {})
+                    .setdefault(run_id, {"label": label, "commit": commit, "points": []})
+                    ["points"].append((n, val * scale)))
+
+    # Sort points within each run by search_iters
+    for mk in data:
+        for path in data[mk]:
+            for metric in data[mk][path]:
+                for run_id in data[mk][path][metric]:
+                    data[mk][path][metric][run_id]["points"].sort(key=lambda x: x[0])
+
+    return data, run_ids
+
+
+# ── chart building ────────────────────────────────────────────────────────────
+
+def _traces_json(path_data: dict, metric: str, show_legend: bool, unit: str = " ms") -> str:
+    traces = []
+    for path in PATH_ORDER:
+        if path not in path_data or metric not in path_data[path]:
+            continue
+        pts = path_data[path][metric]
+        # pts: list of (run_id, val, commit, ts)
+        trace = {
+            "x": [p[0] for p in pts],
+            "y": [p[1] for p in pts],
+            "customdata": [[p[2], p[3]] for p in pts],
+            "type": "scatter",
+            "mode": "lines+markers",
+            "name": PATH_LABELS.get(path, path),
+            "line": {"color": PATH_COLORS.get(path, "#aaa"), "width": 2},
+            "marker": {"size": 7, "symbol": "circle"},
+            "connectgaps": False,
+            "showlegend": show_legend,
+            "hovertemplate": (
+                f"<b>{PATH_LABELS.get(path, path)}</b><br>"
+                "%{customdata[1]}<br>"
+                f"%{{y:.1f}}{unit}<br>"
+                "<span style='color:#7e8385'>commit %{customdata[0]}</span>"
+                "<extra></extra>"
+            ),
+        }
+        traces.append(trace)
+    return json.dumps(traces)
+
+
+_CHART_LAYOUT = {
+    "plot_bgcolor":  "#0d1416",
+    "paper_bgcolor": "#141b1d",
+    "font":          {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9"},
+    "margin":        {"t": 16, "b": 48, "l": 52, "r": 12},
+    "height":        280,
+    "xaxis": {
+        # Categorical: one column per run, evenly spaced. Same-day runs
+        # used to collapse on a date axis; this keeps every run distinct.
+        "type":          "category",
+        "categoryorder": "array",  # categoryarray injected per chart
+        "color":         "#5b5f61",
+        "gridcolor":     "#1c2225",
+        "linecolor":     "#2d3335",
+        "tickfont":      {"size": 11, "family": "Geist Mono, monospace"},
+        "tickangle":     -30,
+        "automargin":    True,
+        "zeroline":      False,
+    },
+    "yaxis": {
+        "rangemode": "tozero",
+        "color":     "#5b5f61",
+        "gridcolor": "#1c2225",
+        "linecolor": "#2d3335",
+        "tickfont":  {"size": 11, "family": "Geist Mono, monospace"},
+        "ticksuffix": " ms",
+        "zeroline":  False,
+    },
+    "legend": {
+        "orientation": "h",
+        "y": -0.28,
+        "x": 0,
+        "font": {"size": 11, "color": "#a1a4a5"},
+        "bgcolor": "rgba(0,0,0,0)",
+    },
+    "hoverlabel": {
+        "bgcolor":    "#1c2225",
+        "bordercolor":"#2d3335",
+        "font":       {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"},
+    },
+}
+
+
+def _chart_card(div_id: str, model: str, traces_json: str, show_legend: bool,
+                run_ids: list[str], run_labels: list[str], unit: str = " ms") -> str:
+    layout = dict(_CHART_LAYOUT)
+    xaxis = {
+        **layout["xaxis"],
+        "categoryarray": run_ids,
+        "tickvals":      run_ids,
+        "ticktext":      run_labels,
+    }
+    layout = {**layout,
+              "xaxis": xaxis,
+              "yaxis": {**layout["yaxis"], "ticksuffix": unit}}
+    if not show_legend:
+        layout = {**layout, "legend": {**layout["legend"], "visible": False},
+                  "margin": {**layout["margin"], "b": 16}}
+    return f"""<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">{model}</span>
+  </div>
+  <div id="{div_id}"></div>
+  <script>
+    Plotly.newPlot("{div_id}", {traces_json}, {json.dumps(layout)},
+      {{responsive: true, displayModeBar: false}});
+  </script>
+</div>"""
+
+
+def _sweep_3d_traces_json(model_data: dict, metric: str, run_ids: list[str], unit: str = " ms") -> str:
+    """One scatter3d trace per (path, run) — same colour per path, stacked by run on Y."""
+    traces = []
+    path_legend_shown: set[str] = set()
+
+    for run_id in run_ids:
+        for path in PATH_ORDER:
+            run_map = model_data.get(path, {}).get(metric, {})
+            if run_id not in run_map:
+                continue
+            entry = run_map[run_id]
+            pts = entry["points"]
+            label = entry["label"]
+            commit = entry["commit"]
+            color = PATH_COLORS.get(path, "#aaa")
+            show_legend = path not in path_legend_shown
+            path_legend_shown.add(path)
+
+            traces.append({
+                "type": "scatter3d",
+                "mode": "lines+markers",
+                "x": [p[0] for p in pts],   # search iters
+                "y": [label] * len(pts),     # run label (categorical)
+                "z": [p[1] for p in pts],    # value (already scaled by build_sweep_series)
+                "name": PATH_LABELS.get(path, path),
+                "legendgroup": path,
+                "showlegend": show_legend,
+                "line":   {"color": color, "width": 5},
+                "marker": {"color": color, "size": 4},
+                "hovertemplate": (
+                    f"<b>{PATH_LABELS.get(path, path)}</b><br>"
+                    f"s=%{{x}} iters<br>%{{z:.1f}}{unit}<br>"
+                    f"{label} · {commit}"
+                    "<extra></extra>"
+                ),
+            })
+
+    # Cross-run wire lines: for each path, connect same-budget points across
+    # runs. Makes regressions at a fixed search budget visible as a kink in the
+    # wireframe. Dashed + thinner than the per-run curves; legendgroup matches
+    # the path so toggling one toggles both.
+    for path in PATH_ORDER:
+        metric_runs = model_data.get(path, {}).get(metric, {})
+        if len(metric_runs) < 2:
+            continue
+        color = PATH_COLORS.get(path, "#aaa")
+        # by_budget[iters] -> list of (run_label, value) in chronological order
+        by_budget: dict = {}
+        for run_id in run_ids:
+            if run_id not in metric_runs:
+                continue
+            entry = metric_runs[run_id]
+            for iters, val in entry["points"]:
+                by_budget.setdefault(iters, []).append((entry["label"], val))
+        for budget, items in sorted(by_budget.items()):
+            if len(items) < 2:
+                continue
+            traces.append({
+                "type": "scatter3d",
+                "mode": "lines",
+                "x": [budget] * len(items),
+                "y": [it[0] for it in items],
+                "z": [it[1] for it in items],
+                "legendgroup": path,
+                "showlegend": False,
+                "line": {"color": color, "width": 2, "dash": "dash"},
+                "hovertemplate": (
+                    f"<b>{PATH_LABELS.get(path, path)} @ s={budget}</b><br>"
+                    f"%{{y}}: %{{z:.1f}}{unit}"
+                    "<extra></extra>"
+                ),
+            })
+    return json.dumps(traces)
+
+
+_SWEEP_3D_LAYOUT = {
+    "paper_bgcolor": "#141b1d",
+    "font": {"family": "Geist, system-ui, sans-serif", "color": "#d7d8d9", "size": 11},
+    "height": 420,
+    "margin": {"t": 20, "b": 0, "l": 0, "r": 0},
+    "legend": {
+        "orientation": "h",
+        "y": -0.05,
+        "x": 0,
+        "font": {"size": 11, "color": "#a1a4a5", "family": "Geist Mono, monospace"},
+        "bgcolor": "rgba(0,0,0,0)",
+    },
+    "hoverlabel": {
+        "bgcolor": "#1c2225",
+        "bordercolor": "#2d3335",
+        "font": {"size": 12, "color": "#d7d8d9", "family": "Geist Mono, monospace"},
+    },
+    "scene": {
+        "bgcolor": "#0d1416",
+        "xaxis": {
+            "title": {"text": "search iters", "font": {"size": 10, "color": "#7e8385"}},
+            "type": "log",
+            "tickvals": [5, 10, 20, 50, 100, 500],
+            "ticktext": ["5", "10", "20", "50", "100", "500"],
+            "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"},
+            "gridcolor": "#1c2225",
+            "linecolor": "#2d3335",
+            "zerolinecolor": "#2d3335",
+        },
+        "yaxis": {
+            "title": {"text": "run", "font": {"size": 10, "color": "#7e8385"}},
+            "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"},
+            "gridcolor": "#1c2225",
+            "linecolor": "#2d3335",
+        },
+        "zaxis": {
+            "title": {"text": "ms", "font": {"size": 10, "color": "#7e8385"}},
+            "rangemode": "tozero",
+            "tickfont": {"size": 10, "family": "Geist Mono, monospace", "color": "#5b5f61"},
+            "ticksuffix": " ms",
+            "gridcolor": "#1c2225",
+            "linecolor": "#2d3335",
+        },
+        "camera": {
+            "eye": {"x": 1.6, "y": -1.6, "z": 0.9},
+        },
+    },
+}
+
+
+def _sweep_3d_card(div_id: str, model: str, traces_json: str, unit: str = " ms") -> str:
+    layout = {**_SWEEP_3D_LAYOUT,
+              "scene": {**_SWEEP_3D_LAYOUT["scene"],
+                        "zaxis": {**_SWEEP_3D_LAYOUT["scene"]["zaxis"],
+                                  "title": {**_SWEEP_3D_LAYOUT["scene"]["zaxis"]["title"],
+                                            "text": unit.strip()},
+                                  "ticksuffix": unit}}}
+    return f"""<div class="chart-card">
+  <div class="chart-card-header">
+    <span class="model-tag">{model}</span>
+  </div>
+  <div id="{div_id}"></div>
+  <script>
+    Plotly.newPlot("{div_id}", {traces_json}, {json.dumps(layout)},
+      {{responsive: true, displayModeBar: true, displaylogo: false,
+        modeBarButtonsToRemove: ["toImage","sendDataToCloud"]}});
+  </script>
+</div>"""
+
+
+# ── HTML assembly ─────────────────────────────────────────────────────────────
+
+def build_html(runs: list[dict], data: dict,
+               run_ids: list[str], run_labels: list[str],
+               sweep_data: dict | None = None,
+               sweep_run_ids: list[str] | None = None) -> str:
+    # Preserve insertion order of models as seen across runs
+    models = list(dict.fromkeys(
+        r["config"]
+        for run in runs
+        for r in run["results"]
+        if not r.get("config", "").startswith("s=") and not r.get("error")
+    ))
+
+    last_ts = ""
+    if runs:
+        raw = runs[-1]["meta"]["timestamp"]
+        try:
+            last_ts = datetime.fromisoformat(raw).strftime("%b %d, %Y · %H:%M")
+        except ValueError:
+            last_ts = raw[:16].replace("T", " ")
+
+    n_runs = len(runs)
+
+    sections_html = ""
+    for metric_key, metric_label, ylabel, _scale, unit in METRICS:
+        active_models = [
+            m for m in models
+            if any(metric_key in data.get(m, {}).get(p, {}) for p in PATH_ORDER)
+        ]
+        if not active_models:
+            continue
+
+        cards_html = ""
+        first = True
+        for model in active_models:
+            path_data = data.get(model, {})
+            div_id = f"c_{metric_key}_{model.replace('-','_').replace('.','_')}"
+            traces = _traces_json(path_data, metric_key, show_legend=first, unit=unit)
+            cards_html += _chart_card(div_id, model, traces, show_legend=first,
+                                      run_ids=run_ids, run_labels=run_labels, unit=unit)
+            first = False
+
+        n = len(active_models)
+        # Clamp columns so charts don't get too narrow; wrap at 4
+        cols = min(n, 4)
+        sections_html += f"""
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">metric</span>
+    <h2 class="section-title">{metric_label} <span class="unit">over time</span></h2>
+    <span class="section-tag">{ylabel}</span>
+  </div>
+  <div class="chart-grid" style="grid-template-columns: repeat({cols}, 1fr)">
+{cards_html}
+  </div>
+</section>"""
+
+    # ── sweep sections (3D) ──────────────────────────────────────────────────
+    sweep_sections_html = ""
+    if sweep_data and sweep_run_ids:
+        sweep_models = list(sweep_data.keys())
+        for metric_key, metric_label, ylabel, _scale, unit in METRICS:
+            active = [
+                m for m in sweep_models
+                if any(
+                    run_id in sweep_data[m].get(p, {}).get(metric_key, {})
+                    for p in PATH_ORDER
+                    for run_id in sweep_run_ids
+                )
+            ]
+            if not active:
+                continue
+            cards_html = ""
+            for model in active:
+                div_id = f"sw_{metric_key}_{model.replace('-','_').replace('.','_')}"
+                traces = _sweep_3d_traces_json(sweep_data[model], metric_key, sweep_run_ids, unit=unit)
+                cards_html += _sweep_3d_card(div_id, model, traces, unit=unit)
+            cols = min(len(active), 4)
+            run_count = len(sweep_run_ids)
+            sweep_sections_html += f"""
+<section>
+  <div class="section-header">
+    <span class="section-eyebrow">sweep · 3d</span>
+    <h2 class="section-title">{metric_label} <span class="unit">vs search budget · over time</span></h2>
+    <span class="section-tag">{run_count} run{"s" if run_count != 1 else ""}</span>
+  </div>
+  <p class="sweep-hint">Drag to rotate · scroll to zoom · each curve = one run</p>
+  <div class="chart-grid" style="grid-template-columns: repeat({cols}, 1fr)">
+{cards_html}
+  </div>
+</section>"""
+
+    return f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>Luminal · Benchmark Dashboard</title>
+<link rel="preconnect" href="https://fonts.googleapis.com">
+<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+<link href="https://fonts.googleapis.com/css2?family=Geist:wght@300;400;500;600&family=Geist+Mono:wght@300;400;500&display=swap" rel="stylesheet">
+<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
+<style>
+*, *::before, *::after {{ box-sizing: border-box; margin: 0; padding: 0; }}
+html {{ -webkit-font-smoothing: antialiased; scroll-behavior: smooth; }}
+
+body {{
+  font-family: 'Geist', system-ui, sans-serif;
+  background: #030712;
+  color: #d7d8d9;
+  min-height: 100vh;
+  line-height: 1.5;
+}}
+
+/* ── NAV ── */
+nav {{
+  position: sticky;
+  top: 0;
+  z-index: 50;
+  height: 56px;
+  background: rgba(8, 15, 17, 0.92);
+  backdrop-filter: blur(8px);
+  -webkit-backdrop-filter: blur(8px);
+  border-bottom: 1px solid #2d3335;
+  display: flex;
+  align-items: center;
+  padding: 0 24px;
+  gap: 0;
+}}
+.nav-brand {{
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  font-family: 'Geist Mono', monospace;
+  font-size: 14px;
+  font-weight: 500;
+  letter-spacing: 0.05em;
+  color: #2faa6e;
+  text-decoration: none;
+}}
+.nav-dot {{
+  width: 6px;
+  height: 6px;
+  background: #2faa6e;
+  border-radius: 50%;
+  flex-shrink: 0;
+  animation: pulse-glow 2s ease-in-out infinite;
+}}
+.nav-sep {{
+  color: #2d3335;
+  margin: 0 14px;
+  font-size: 18px;
+  font-weight: 300;
+}}
+.nav-page {{
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: #7e8385;
+}}
+
+@keyframes pulse-glow {{
+  0%, 100% {{ opacity: 1; }}
+  50%       {{ opacity: 0.35; }}
+}}
+
+/* ── MAIN ── */
+main {{
+  max-width: 1200px;
+  margin: 0 auto;
+  padding: 40px 24px 80px;
+}}
+
+/* ── PAGE HEADER ── */
+.page-header {{
+  margin-bottom: 40px;
+  padding-bottom: 32px;
+  border-bottom: 1px solid #1c2225;
+}}
+.page-eyebrow {{
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: #2faa6e;
+  margin-bottom: 10px;
+}}
+.page-title {{
+  font-size: 30px;
+  font-weight: 500;
+  letter-spacing: -0.025em;
+  color: #d7d8d9;
+  margin-bottom: 10px;
+}}
+.page-meta {{
+  font-size: 14px;
+  color: #7e8385;
+  display: flex;
+  align-items: center;
+  gap: 0;
+  flex-wrap: wrap;
+}}
+.meta-sep {{
+  font-family: 'Geist Mono', monospace;
+  color: #2d3335;
+  margin: 0 10px;
+}}
+.meta-val {{
+  font-family: 'Geist Mono', monospace;
+  font-size: 13px;
+  color: #5b5f61;
+}}
+
+/* ── LEGEND STRIP ── */
+.legend-strip {{
+  display: flex;
+  flex-wrap: wrap;
+  gap: 6px;
+  margin-bottom: 32px;
+}}
+.legend-pill {{
+  display: flex;
+  align-items: center;
+  gap: 6px;
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  color: #a1a4a5;
+  background: #141b1d;
+  border: 1px solid #2d3335;
+  border-radius: 2px;
+  padding: 4px 10px;
+}}
+.legend-swatch {{
+  width: 8px;
+  height: 8px;
+  border-radius: 50%;
+  flex-shrink: 0;
+}}
+
+/* ── SECTIONS ── */
+section {{ margin-bottom: 48px; }}
+.section-header {{
+  display: flex;
+  align-items: baseline;
+  gap: 10px;
+  margin-bottom: 16px;
+  padding-bottom: 12px;
+  border-bottom: 1px solid #1c2225;
+  flex-wrap: wrap;
+}}
+.section-eyebrow {{
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: #404647;
+}}
+.section-title {{
+  font-size: 18px;
+  font-weight: 500;
+  color: #d7d8d9;
+  letter-spacing: -0.01em;
+}}
+.section-title .unit {{
+  color: #7e8385;
+  font-weight: 400;
+}}
+.section-tag {{
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+  color: #2faa6e;
+  background: #162322;
+  border: 1px solid #1c372e;
+  padding: 2px 8px;
+  border-radius: 2px;
+  margin-left: auto;
+}}
+
+/* ── CHART GRID ── */
+.chart-grid {{
+  display: grid;
+  gap: 10px;
+}}
+.chart-card {{
+  background: #141b1d;
+  border: 1px solid #2d3335;
+  border-radius: 2px;
+  overflow: hidden;
+  transition: border-color 150ms;
+  min-width: 0;
+}}
+.chart-card:hover {{ border-color: #404647; }}
+.chart-card-header {{
+  padding: 10px 14px 0;
+  display: flex;
+  align-items: center;
+}}
+.model-tag {{
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.06em;
+  text-transform: uppercase;
+  color: #7e8385;
+}}
+
+/* ── FOOTER ── */
+footer {{
+  max-width: 1200px;
+  margin: 0 auto;
+  padding: 20px 24px;
+  border-top: 1px solid #1c2225;
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  color: #404647;
+  display: flex;
+  justify-content: space-between;
+  flex-wrap: wrap;
+  gap: 8px;
+}}
+
+.section-divider {{
+  border: none;
+  border-top: 1px solid #1c2225;
+  margin: 8px 0 40px;
+}}
+.sweep-hint {{
+  font-family: 'Geist Mono', monospace;
+  font-size: 11px;
+  letter-spacing: 0.04em;
+  color: #404647;
+  margin-bottom: 12px;
+}}
+
+@media (max-width: 768px) {{
+  .chart-grid {{ grid-template-columns: 1fr !important; }}
+  .page-title {{ font-size: 22px; }}
+}}
+</style>
+</head>
+<body>
+
+<nav>
+  <a class="nav-brand" href="https://luminal.com">
+    <span class="nav-dot"></span>luminal
+  </a>
+  <span class="nav-sep">/</span>
+  <span class="nav-page">benchmarks</span>
+</nav>
+
+<main>
+
+<header class="page-header">
+  <p class="page-eyebrow">performance · time-series</p>
+  <h1 class="page-title">Benchmark Dashboard</h1>
+  <div class="page-meta">
+    <span>Last updated</span>
+    <span class="meta-sep">·</span>
+    <span class="meta-val">{last_ts}</span>
+    <span class="meta-sep">·</span>
+    <span class="meta-val">{n_runs} run{"s" if n_runs != 1 else ""} in history</span>
+  </div>
+</header>
+
+<div class="legend-strip">
+  {"".join(
+      f'<div class="legend-pill"><span class="legend-swatch" style="background:{PATH_COLORS[p]}"></span>{PATH_LABELS[p]}</div>'
+      for p in PATH_ORDER
+  )}
+</div>
+
+{sections_html}
+{"<hr class='section-divider'>" + sweep_sections_html if sweep_sections_html else ""}
+
+</main>
+
+<footer>
+  <span>luminal · benchmark dashboard</span>
+  <span>generated {last_ts}</span>
+</footer>
+
+</body>
+</html>
+"""
+
+
+# ── entry point ───────────────────────────────────────────────────────────────
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--db", default=str(db.DEFAULT_DB_PATH),
+                    help=f"SQLite bench DB (default: {db.DEFAULT_DB_PATH})")
+    ap.add_argument("--out", default=str(BENCH_DIR / "dashboard.html"),
+                    help="Output HTML file")
+    args = ap.parse_args()
+
+    runs = load_history(Path(args.db))
+    if not runs:
+        print(f"No runs found in {args.db}. Run --ur-test (or backfill) first.")
+        return
+
+    data, run_ids, run_labels = build_series(runs)
+    sweep_data, sweep_run_ids = build_sweep_series(runs)
+    html = build_html(runs, data, run_ids, run_labels, sweep_data, sweep_run_ids)
+    Path(args.out).write_text(html)
+
+    print(f"wrote {args.out}  ({len(runs)} runs, {sum(len(v) for v in data.values())} model×path series)")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/ttft/gen_report.py
+++ b/benchmarks/ttft/gen_report.py
@@ -0,0 +1,349 @@
+#!/usr/bin/env python3
+"""Generate a standalone HTML benchmark report from a single benchmark run.
+
+Usage:
+    python3 gen_report.py [--db PATH] [--run RUN_ID] [--out report.html] [--title "..."]
+
+Sections are split out of a single run automatically:
+  - per-model_key, "comparison" (configs not matching s=N)  →  grouped bar chart
+  - per-model_key, "sweep" (configs matching s=N)           →  line chart (log X)
+For runs without model_key (e.g. single-config runs), one section per detected
+shape is produced instead.
+"""
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+import db
+
+PATH_ORDER = ["python_baseline", "python_torch_compile", "python_luminal", "rust"]
+PATH_LABELS = {
+    "python_baseline":      "HF Baseline",
+    "python_torch_compile": "torch.compile",
+    "python_luminal":       "luminal backend",
+    "rust":                 "Rust (luminal)",
+}
+PATH_COLORS = {
+    "python_baseline":      "#888888",
+    "python_torch_compile": "#5ab552",
+    "python_luminal":       "#4c9ed9",
+    "rust":                 "#d97a4c",
+}
+
+
+# ── helpers ──────────────────────────────────────────────────────────────────
+
+def _fmt(v, decimals=1, suffix=""):
+    return f"{v:.{decimals}f}{suffix}" if v is not None else "—"
+
+def _section_title(path: Path) -> str:
+    stem = path.stem.replace("_", " ").replace("-", " ")
+    return stem.title()
+
+def _is_sweep(configs: list[str]) -> bool:
+    return bool(configs) and all(re.fullmatch(r"s=\d+", c) for c in configs)
+
+def _group_by_config(results: list[dict]) -> dict[str, dict[str, dict]]:
+    """Return {config: {path: result_dict}}."""
+    out: dict[str, dict[str, dict]] = {}
+    for r in results:
+        cfg = r.get("config", "default")
+        out.setdefault(cfg, {})[r["path"]] = r
+    return out
+
+
+# ── chart builders (return Plotly figure dicts) ───────────────────────────────
+
+def _bar_figure(by_config: dict, metric: str, title: str,
+                scale: float = 1.0, unit: str = "ms") -> dict:
+    configs = list(by_config.keys())
+    traces = []
+    for path in PATH_ORDER:
+        ys, texts = [], []
+        for cfg in configs:
+            r = by_config[cfg].get(path)
+            raw = r.get(metric) if r and not r.get("error") else None
+            v = raw * scale if raw is not None else None
+            ys.append(v if v is not None else 0)
+            texts.append(f"{v:.1f} {unit}" if v is not None else "n/a")
+        if any(y > 0 for y in ys):
+            traces.append({
+                "type": "bar",
+                "name": PATH_LABELS.get(path, path),
+                "x": configs,
+                "y": ys,
+                "text": texts,
+                "textposition": "outside",
+                "marker": {"color": PATH_COLORS.get(path, "#aaaaaa")},
+                "hovertemplate": "%{x}<br>" + PATH_LABELS.get(path, path)
+                                 + f": %{{y:.1f}} {unit}<extra></extra>",
+            })
+    return {
+        "data": traces,
+        "layout": {
+            "title": title,
+            "yaxis": {"title": unit, "rangemode": "tozero"},
+            "barmode": "group",
+            "legend": {"orientation": "h", "y": -0.2},
+            "margin": {"t": 50, "b": 80},
+            "plot_bgcolor": "#fafafa",
+            "paper_bgcolor": "#ffffff",
+        },
+    }
+
+
+def _line_figure(by_config: dict, metric: str, title: str,
+                 scale: float = 1.0, unit: str = "ms") -> dict:
+    """Line chart for sweep data. Config names are 's=N'; X = N (log scale)."""
+    def _iter(cfg):
+        m = re.fullmatch(r"s=(\d+)", cfg)
+        return int(m.group(1)) if m else 0
+
+    configs_sorted = sorted(by_config.keys(), key=_iter)
+    xs = [_iter(c) for c in configs_sorted]
+
+    paths_present = {p for cfg in by_config.values() for p in cfg}
+    traces = []
+    for path in PATH_ORDER:
+        if path not in paths_present:
+            continue
+        ys = []
+        for cfg in configs_sorted:
+            r = by_config[cfg].get(path)
+            raw = r.get(metric) if r and not r.get("error") else None
+            ys.append(raw * scale if raw is not None else None)
+        if any(y is not None for y in ys):
+            traces.append({
+                "type": "scatter",
+                "mode": "lines+markers",
+                "name": PATH_LABELS.get(path, path),
+                "x": xs,
+                "y": ys,
+                "marker": {"size": 8, "color": PATH_COLORS.get(path, "#aaaaaa")},
+                "line": {"color": PATH_COLORS.get(path, "#aaaaaa"), "width": 2},
+                "hovertemplate": "iters=%{x}<br>" + PATH_LABELS.get(path, path)
+                                 + f": %{{y:.1f}} {unit}<extra></extra>",
+            })
+    return {
+        "data": traces,
+        "layout": {
+            "title": title,
+            "xaxis": {"title": "Search iterations", "type": "log",
+                      "tickvals": xs, "ticktext": [str(x) for x in xs]},
+            "yaxis": {"title": unit, "rangemode": "tozero"},
+            "legend": {"orientation": "h", "y": -0.25},
+            "margin": {"t": 50, "b": 90},
+            "plot_bgcolor": "#fafafa",
+            "paper_bgcolor": "#ffffff",
+        },
+    }
+
+
+# ── table builder ─────────────────────────────────────────────────────────────
+
+def _table_html(results: list[dict]) -> str:
+    rows = []
+    for r in sorted(results, key=lambda r: (r.get("config", ""), PATH_ORDER.index(r["path"]) if r["path"] in PATH_ORDER else 99)):
+        error = r.get("error")
+        style = ' style="background:#fff0f0"' if error else ""
+        path_label = PATH_LABELS.get(r["path"], r["path"])
+        cfg = r.get("config", "—")
+        ttft = _fmt(r.get("ttft_ms"), 1, " ms")
+        tpot = _fmt(r.get("tpot_ms"), 1, " ms")
+        tput = _fmt(r.get("throughput_tps"), 1, " tok/s")
+        comp = _fmt(r.get("compile_ms"), 0, " ms") if r.get("compile_ms") else "—"
+        ptok = str(r.get("prompt_tokens", "—"))
+        note = (r.get("error") or r.get("note") or "")[:90]
+        note_style = ' style="color:#c00"' if error else ' style="color:#777"'
+        rows.append(
+            f'<tr{style}>'
+            f'<td>{path_label}</td><td>{cfg}</td>'
+            f'<td>{ttft}</td><td>{tpot}</td><td>{tput}</td>'
+            f'<td>{comp}</td><td>{ptok}</td>'
+            f'<td{note_style}>{note}</td>'
+            f'</tr>'
+        )
+    return (
+        '<table>'
+        '<thead><tr>'
+        '<th>Path</th><th>Config</th>'
+        '<th>TTFT</th><th>TPOT</th><th>Throughput</th>'
+        '<th>Compile</th><th>Prompt tokens</th><th>Note</th>'
+        '</tr></thead>'
+        '<tbody>' + "\n".join(rows) + '</tbody>'
+        '</table>'
+    )
+
+
+# ── section builder ───────────────────────────────────────────────────────────
+
+def _section_html(sec_id: str, title: str, results: list[dict], fig_counter: list) -> str:
+    by_config = _group_by_config(results)
+    configs = list(by_config.keys())
+    sweep = _is_sweep(configs)
+
+    models = list(dict.fromkeys(r.get("model", "") for r in results if r.get("model")))
+    model_str = ", ".join(models) if models else "—"
+    prompt_tokens = list(dict.fromkeys(r.get("prompt_tokens") for r in results if r.get("prompt_tokens")))
+    tok_str = "/".join(str(t) for t in prompt_tokens) + " prompt tokens" if prompt_tokens else ""
+
+    builder = _line_figure if sweep else _bar_figure
+    ttft_fig = builder(by_config, "ttft_ms", "TTFT")
+    has_tpot = any(r.get("tpot_ms") is not None for r in results if not r.get("error"))
+    tpot_fig = builder(by_config, "tpot_ms", "TPOT") if has_tpot else None
+    has_compile = any(r.get("compile_ms") is not None and r.get("compile_ms") > 0
+                      for r in results if not r.get("error"))
+    compile_fig = (builder(by_config, "compile_ms", "Time to Search",
+                           scale=0.001, unit="sec")
+                   if has_compile else None)
+
+    def chart_div(fig):
+        n = fig_counter[0]
+        fig_counter[0] += 1
+        return (
+            f'<div id="fig{n}" class="chart"></div>'
+            f'<script>Plotly.newPlot("fig{n}", {json.dumps(fig["data"])}, {json.dumps(fig["layout"])}, {{responsive:true}});</script>'
+        )
+
+    charts_html = f'<div class="charts-row">{chart_div(ttft_fig)}'
+    if tpot_fig:
+        charts_html += chart_div(tpot_fig)
+    if compile_fig:
+        charts_html += chart_div(compile_fig)
+    charts_html += '</div>'
+
+    return f"""
+<section id="{sec_id}">
+  <h2>{title}</h2>
+  <p class="meta">{model_str}{" · " + tok_str if tok_str else ""} · {len(results)} results</p>
+  {charts_html}
+  {_table_html(results)}
+</section>
+"""
+
+
+# ── full page ─────────────────────────────────────────────────────────────────
+
+CSS = """
+* { box-sizing: border-box; margin: 0; padding: 0; }
+body { font-family: system-ui, sans-serif; background: #f0f2f5; color: #222; }
+header { background: #1a1a2e; color: #fff; padding: 1rem 2rem;
+         position: sticky; top: 0; z-index: 100; display: flex;
+         align-items: center; gap: 2rem; }
+header h1 { font-size: 1.2rem; white-space: nowrap; }
+nav a { color: #a0c4ff; text-decoration: none; font-size: 0.9rem;
+        padding: 0.3rem 0.7rem; border-radius: 4px; white-space: nowrap; }
+nav a:hover { background: rgba(255,255,255,0.15); }
+main { max-width: 1400px; margin: 0 auto; padding: 2rem; display: flex;
+       flex-direction: column; gap: 2.5rem; }
+section { background: #fff; border-radius: 8px; padding: 1.5rem 2rem;
+          box-shadow: 0 1px 4px rgba(0,0,0,.08); }
+h2 { font-size: 1.3rem; margin-bottom: 0.4rem; }
+.meta { color: #666; font-size: 0.85rem; margin-bottom: 1.2rem; }
+.charts-row { display: flex; gap: 1.5rem; flex-wrap: wrap; margin-bottom: 1.5rem; }
+.chart { flex: 1; min-width: 340px; height: 360px; }
+table { width: 100%; border-collapse: collapse; font-size: 0.82rem; }
+thead tr { background: #f5f5f5; }
+th, td { padding: 0.45rem 0.7rem; text-align: left;
+         border-bottom: 1px solid #e8e8e8; }
+th { font-weight: 600; white-space: nowrap; }
+tr:last-child td { border-bottom: none; }
+tr:hover { background: #fafafa; }
+"""
+
+def _build_html(sections: list[tuple[str, str, list[dict]]], title: str) -> str:
+    nav_links = "".join(f'<a href="#{sid}">{stitle}</a>' for sid, stitle, _ in sections)
+    fig_counter = [0]
+    body = "".join(_section_html(sid, stitle, results, fig_counter)
+                   for sid, stitle, results in sections)
+    return f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>{title}</title>
+<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
+<style>{CSS}</style>
+</head>
+<body>
+<header>
+  <h1>{title}</h1>
+  <nav>{nav_links}</nav>
+</header>
+<main>{body}</main>
+</body>
+</html>"""
+
+
+# ── CLI ───────────────────────────────────────────────────────────────────────
+
+def _sections_for_run(results: list[dict]) -> list[tuple[str, str, list[dict]]]:
+    """Split a single run's results into (sec_id, title, records) sections.
+
+    Splits first by model_key (NULL → 'results'), then within each by
+    sweep-vs-comparison based on config 's=N' shape."""
+    by_key: dict[str | None, list[dict]] = {}
+    for r in results:
+        by_key.setdefault(r.get("model_key"), []).append(r)
+
+    sections: list[tuple[str, str, list[dict]]] = []
+    for key, recs in by_key.items():
+        comp, sweep = [], []
+        for r in recs:
+            (sweep if str(r.get("config", "")).startswith("s=") else comp).append(r)
+        prefix = (key or "results").replace("-", "_").replace(".", "_")
+        title_prefix = key or "Results"
+        if comp:
+            sections.append((f"{prefix}_comparison",
+                             f"{title_prefix} comparison".strip().title(),
+                             comp))
+        if sweep:
+            sections.append((f"{prefix}_sweep",
+                             f"{title_prefix} sweep".strip().title(),
+                             sweep))
+    return sections
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--db", default=str(db.DEFAULT_DB_PATH),
+                    help=f"SQLite bench DB (default: {db.DEFAULT_DB_PATH})")
+    ap.add_argument("--run", default=None,
+                    help="Run ID to render (default: latest run in DB)")
+    ap.add_argument("--out", default=None,
+                    help="Output HTML path (default: report.html in benchmarks/ttft/)")
+    ap.add_argument("--title", default="Luminal TTFT Benchmark Report",
+                    help="Page title and heading")
+    args = ap.parse_args()
+
+    if not Path(args.db).exists():
+        print(f"DB not found: {args.db}", file=sys.stderr)
+        sys.exit(1)
+
+    conn = db.connect(args.db)
+    run_id = args.run or db.latest_run_id(conn)
+    if run_id is None:
+        print(f"No runs in {args.db}", file=sys.stderr)
+        sys.exit(1)
+
+    results = db.load_results(conn, run_id)
+    if not results:
+        print(f"No results for run {run_id}", file=sys.stderr)
+        sys.exit(1)
+
+    sections = _sections_for_run(results)
+    if not sections:
+        print(f"No section data for run {run_id}", file=sys.stderr)
+        sys.exit(1)
+
+    out = Path(args.out) if args.out else Path(__file__).parent / "report.html"
+    html = _build_html(sections, f"{args.title} — {run_id}")
+    out.write_text(html)
+    print(f"wrote {out}  (run {run_id}, {len(sections)} sections, {len(results)} results)")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/ttft/report.html
+++ b/benchmarks/ttft/report.html
@@ -0,0 +1,148 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>Luminal TTFT Benchmark Report — 2026-05-01T18-56-26-996695</title>
+<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
+<style>
+* { box-sizing: border-box; margin: 0; padding: 0; }
+body { font-family: system-ui, sans-serif; background: #f0f2f5; color: #222; }
+header { background: #1a1a2e; color: #fff; padding: 1rem 2rem;
+         position: sticky; top: 0; z-index: 100; display: flex;
+         align-items: center; gap: 2rem; }
+header h1 { font-size: 1.2rem; white-space: nowrap; }
+nav a { color: #a0c4ff; text-decoration: none; font-size: 0.9rem;
+        padding: 0.3rem 0.7rem; border-radius: 4px; white-space: nowrap; }
+nav a:hover { background: rgba(255,255,255,0.15); }
+main { max-width: 1400px; margin: 0 auto; padding: 2rem; display: flex;
+       flex-direction: column; gap: 2.5rem; }
+section { background: #fff; border-radius: 8px; padding: 1.5rem 2rem;
+          box-shadow: 0 1px 4px rgba(0,0,0,.08); }
+h2 { font-size: 1.3rem; margin-bottom: 0.4rem; }
+.meta { color: #666; font-size: 0.85rem; margin-bottom: 1.2rem; }
+.charts-row { display: flex; gap: 1.5rem; flex-wrap: wrap; margin-bottom: 1.5rem; }
+.chart { flex: 1; min-width: 340px; height: 360px; }
+table { width: 100%; border-collapse: collapse; font-size: 0.82rem; }
+thead tr { background: #f5f5f5; }
+th, td { padding: 0.45rem 0.7rem; text-align: left;
+         border-bottom: 1px solid #e8e8e8; }
+th { font-weight: 600; white-space: nowrap; }
+tr:last-child td { border-bottom: none; }
+tr:hover { background: #fafafa; }
+</style>
+</head>
+<body>
+<header>
+  <h1>Luminal TTFT Benchmark Report — 2026-05-01T18-56-26-996695</h1>
+  <nav><a href="#llama_8b_comparison">Llama-8B Comparison</a><a href="#llama_8b_sweep">Llama-8B Sweep</a><a href="#qwen3_4b_comparison">Qwen3-4B Comparison</a><a href="#qwen3_4b_sweep">Qwen3-4B Sweep</a><a href="#gemma3_4b_comparison">Gemma3-4B Comparison</a><a href="#gemma3_4b_sweep">Gemma3-4B Sweep</a><a href="#gemma4_moe_comparison">Gemma4-Moe Comparison</a><a href="#gemma4_moe_sweep">Gemma4-Moe Sweep</a><a href="#qwen3_moe_comparison">Qwen3-Moe Comparison</a><a href="#qwen3_moe_sweep">Qwen3-Moe Sweep</a></nav>
+</header>
+<main>
+<section id="llama_8b_comparison">
+  <h2>Llama-8B Comparison</h2>
+  <p class="meta">NousResearch/Meta-Llama-3-8B-Instruct · 21 prompt tokens · 4 results</p>
+  <div class="charts-row"><div id="fig0" class="chart"></div><script>Plotly.newPlot("fig0", [{"type": "bar", "name": "HF Baseline", "x": ["llama-8b"], "y": [705.9654394979589], "text": ["706.0 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["llama-8b"], "y": [307.66548847896047], "text": ["307.7 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["llama-8b"], "y": [461.48114453535527], "text": ["461.5 ms"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["llama-8b"], "y": [1026.86], "text": ["1026.9 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig1" class="chart"></div><script>Plotly.newPlot("fig1", [{"type": "bar", "name": "HF Baseline", "x": ["llama-8b"], "y": [34.15271903970279], "text": ["34.2 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["llama-8b"], "y": [171.7862353892997], "text": ["171.8 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["llama-8b"], "y": [23.078908618772402], "text": ["23.1 ms"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["llama-8b"], "y": [51.64], "text": ["51.6 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig2" class="chart"></div><script>Plotly.newPlot("fig2", [{"type": "bar", "name": "torch.compile", "x": ["llama-8b"], "y": [18.760145067994017], "text": ["18.8 sec"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} sec<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["llama-8b"], "y": [95.96263545705006], "text": ["96.0 sec"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} sec<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["llama-8b"], "y": [84.45343], "text": ["84.5 sec"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "yaxis": {"title": "sec", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>HF Baseline</td><td>llama-8b</td><td>706.0 ms</td><td>34.2 ms</td><td>29.3 tok/s</td><td>—</td><td>21</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>torch.compile</td><td>llama-8b</td><td>307.7 ms</td><td>171.8 ms</td><td>5.8 tok/s</td><td>18760 ms</td><td>21</td><td style="color:#777">sequential per-token, StaticCache KV cache (torch.compile inductor)</td></tr>
+<tr><td>luminal backend</td><td>llama-8b</td><td>461.5 ms</td><td>23.1 ms</td><td>43.3 tok/s</td><td>95963 ms</td><td>21</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>llama-8b</td><td>1026.9 ms</td><td>51.6 ms</td><td>19.4 tok/s</td><td>84453 ms</td><td>21</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+
+<section id="llama_8b_sweep">
+  <h2>Llama-8B Sweep</h2>
+  <p class="meta">NousResearch/Meta-Llama-3-8B-Instruct · 21 prompt tokens · 6 results</p>
+  <div class="charts-row"><div id="fig3" class="chart"></div><script>Plotly.newPlot("fig3", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [470.7036415056791, 460.72837291285396, 472.43661794345826], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [751.03, 1038.34, 453.16], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig4" class="chart"></div><script>Plotly.newPlot("fig4", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [23.540849717101082, 23.101884137140587, 23.610779400914907], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [38.2, 51.92, 24.09], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig5" class="chart"></div><script>Plotly.newPlot("fig5", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [28.428826077957638, 43.57440591201885, 95.52432684396626], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} sec<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [15.14307, 30.12727, 84.87889], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "sec", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>luminal backend</td><td>s=10</td><td>470.7 ms</td><td>23.5 ms</td><td>42.5 tok/s</td><td>28429 ms</td><td>21</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=10</td><td>751.0 ms</td><td>38.2 ms</td><td>26.2 tok/s</td><td>15143 ms</td><td>21</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr><td>luminal backend</td><td>s=100</td><td>460.7 ms</td><td>23.1 ms</td><td>43.3 tok/s</td><td>43574 ms</td><td>21</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=100</td><td>1038.3 ms</td><td>51.9 ms</td><td>19.3 tok/s</td><td>30127 ms</td><td>21</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr><td>luminal backend</td><td>s=500</td><td>472.4 ms</td><td>23.6 ms</td><td>42.4 tok/s</td><td>95524 ms</td><td>21</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=500</td><td>453.2 ms</td><td>24.1 ms</td><td>41.5 tok/s</td><td>84879 ms</td><td>21</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+
+<section id="qwen3_4b_comparison">
+  <h2>Qwen3-4B Comparison</h2>
+  <p class="meta">Qwen/Qwen3-4B · 19/11 prompt tokens · 4 results</p>
+  <div class="charts-row"><div id="fig6" class="chart"></div><script>Plotly.newPlot("fig6", [{"type": "bar", "name": "HF Baseline", "x": ["qwen3-4b"], "y": [869.2860195587855], "text": ["869.3 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["qwen3-4b"], "y": [298.27259748708457], "text": ["298.3 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["qwen3-4b"], "y": [485.3892414830625], "text": ["485.4 ms"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["qwen3-4b"], "y": [398.58], "text": ["398.6 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig7" class="chart"></div><script>Plotly.newPlot("fig7", [{"type": "bar", "name": "HF Baseline", "x": ["qwen3-4b"], "y": [47.71483448566869], "text": ["47.7 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["qwen3-4b"], "y": [468.56868775503244], "text": ["468.6 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["qwen3-4b"], "y": [26.90318431414198], "text": ["26.9 ms"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["qwen3-4b"], "y": [40.62], "text": ["40.6 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig8" class="chart"></div><script>Plotly.newPlot("fig8", [{"type": "bar", "name": "torch.compile", "x": ["qwen3-4b"], "y": [4.680963660997804], "text": ["4.7 sec"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} sec<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["qwen3-4b"], "y": [45.345814052037895], "text": ["45.3 sec"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} sec<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["qwen3-4b"], "y": [19.92977], "text": ["19.9 sec"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "yaxis": {"title": "sec", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>HF Baseline</td><td>qwen3-4b</td><td>869.3 ms</td><td>47.7 ms</td><td>21.0 tok/s</td><td>—</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>torch.compile</td><td>qwen3-4b</td><td>298.3 ms</td><td>468.6 ms</td><td>2.1 tok/s</td><td>4681 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache (torch.compile inductor)</td></tr>
+<tr><td>luminal backend</td><td>qwen3-4b</td><td>485.4 ms</td><td>26.9 ms</td><td>37.2 tok/s</td><td>45346 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>qwen3-4b</td><td>398.6 ms</td><td>40.6 ms</td><td>24.6 tok/s</td><td>19930 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+
+<section id="qwen3_4b_sweep">
+  <h2>Qwen3-4B Sweep</h2>
+  <p class="meta">Qwen/Qwen3-4B · 19/11 prompt tokens · 6 results</p>
+  <div class="charts-row"><div id="fig9" class="chart"></div><script>Plotly.newPlot("fig9", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [465.02652901108377, 465.9317950136028, 495.75577257201076], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [398.44, 390.08, 559.29], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig10" class="chart"></div><script>Plotly.newPlot("fig10", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [25.875402649398893, 25.884080055402592, 27.492373346467502], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [40.64, 39.98, 55.37], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig11" class="chart"></div><script>Plotly.newPlot("fig11", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [37.92102829599753, 54.08867314597592, 118.29659596900456], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} sec<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [12.448030000000001, 27.06796, 81.89342], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "sec", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>luminal backend</td><td>s=10</td><td>465.0 ms</td><td>25.9 ms</td><td>38.6 tok/s</td><td>37921 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=10</td><td>398.4 ms</td><td>40.6 ms</td><td>24.6 tok/s</td><td>12448 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr><td>luminal backend</td><td>s=100</td><td>465.9 ms</td><td>25.9 ms</td><td>38.6 tok/s</td><td>54089 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=100</td><td>390.1 ms</td><td>40.0 ms</td><td>25.0 tok/s</td><td>27068 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr><td>luminal backend</td><td>s=500</td><td>495.8 ms</td><td>27.5 ms</td><td>36.4 tok/s</td><td>118297 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=500</td><td>559.3 ms</td><td>55.4 ms</td><td>18.1 tok/s</td><td>81893 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+
+<section id="gemma3_4b_comparison">
+  <h2>Gemma3-4B Comparison</h2>
+  <p class="meta">unsloth/gemma-3-4b-it · 19/11 prompt tokens · 4 results</p>
+  <div class="charts-row"><div id="fig12" class="chart"></div><script>Plotly.newPlot("fig12", [{"type": "bar", "name": "HF Baseline", "x": ["gemma3-4b"], "y": [951.1196144158021], "text": ["951.1 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["gemma3-4b"], "y": [300.9451600664761], "text": ["300.9 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["gemma3-4b"], "y": [404.43], "text": ["404.4 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig13" class="chart"></div><script>Plotly.newPlot("fig13", [{"type": "bar", "name": "HF Baseline", "x": ["gemma3-4b"], "y": [52.498737201676704], "text": ["52.5 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["gemma3-4b"], "y": [2197.426627812092], "text": ["2197.4 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["gemma3-4b"], "y": [38.99], "text": ["39.0 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig14" class="chart"></div><script>Plotly.newPlot("fig14", [{"type": "bar", "name": "torch.compile", "x": ["gemma3-4b"], "y": [26.649526304972824], "text": ["26.6 sec"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} sec<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["gemma3-4b"], "y": [156.84164], "text": ["156.8 sec"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "yaxis": {"title": "sec", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>HF Baseline</td><td>gemma3-4b</td><td>951.1 ms</td><td>52.5 ms</td><td>19.0 tok/s</td><td>—</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>torch.compile</td><td>gemma3-4b</td><td>300.9 ms</td><td>2197.4 ms</td><td>0.5 tok/s</td><td>26650 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache (torch.compile inductor)</td></tr>
+<tr style="background:#fff0f0"><td>luminal backend</td><td>gemma3-4b</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">bench_python_luminal.py failed with code 1</td></tr>
+<tr><td>Rust (luminal)</td><td>gemma3-4b</td><td>404.4 ms</td><td>39.0 ms</td><td>25.6 tok/s</td><td>156842 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+
+<section id="gemma3_4b_sweep">
+  <h2>Gemma3-4B Sweep</h2>
+  <p class="meta">unsloth/gemma-3-4b-it · 11 prompt tokens · 6 results</p>
+  <div class="charts-row"><div id="fig15" class="chart"></div><script>Plotly.newPlot("fig15", [{"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [388.19, 436.49, 386.13], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig16" class="chart"></div><script>Plotly.newPlot("fig16", [{"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [37.47, 41.95, 37.25], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig17" class="chart"></div><script>Plotly.newPlot("fig17", [{"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [102.18644, 186.34269, 498.48983000000004], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "sec", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr style="background:#fff0f0"><td>luminal backend</td><td>s=10</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">bench_python_luminal.py failed with code 1</td></tr>
+<tr><td>Rust (luminal)</td><td>s=10</td><td>388.2 ms</td><td>37.5 ms</td><td>26.7 tok/s</td><td>102186 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr style="background:#fff0f0"><td>luminal backend</td><td>s=100</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">bench_python_luminal.py failed with code 1</td></tr>
+<tr><td>Rust (luminal)</td><td>s=100</td><td>436.5 ms</td><td>42.0 ms</td><td>23.8 tok/s</td><td>186343 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr style="background:#fff0f0"><td>luminal backend</td><td>s=500</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">bench_python_luminal.py failed with code 1</td></tr>
+<tr><td>Rust (luminal)</td><td>s=500</td><td>386.1 ms</td><td>37.2 ms</td><td>26.8 tok/s</td><td>498490 ms</td><td>11</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+
+<section id="gemma4_moe_comparison">
+  <h2>Gemma4-Moe Comparison</h2>
+  <p class="meta">google/gemma-4-26B-A4B · 11 prompt tokens · 4 results</p>
+  <div class="charts-row"><div id="fig18" class="chart"></div><script>Plotly.newPlot("fig18", [{"type": "bar", "name": "HF Baseline", "x": ["gemma4-moe"], "y": [837.3980740143452], "text": ["837.4 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["gemma4-moe"], "y": [245.510076492792], "text": ["245.5 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig19" class="chart"></div><script>Plotly.newPlot("fig19", [{"type": "bar", "name": "HF Baseline", "x": ["gemma4-moe"], "y": [83.64427039632574], "text": ["83.6 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["gemma4-moe"], "y": [654.9649795080768], "text": ["655.0 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig20" class="chart"></div><script>Plotly.newPlot("fig20", [{"type": "bar", "name": "torch.compile", "x": ["gemma4-moe"], "y": [38.81582092499593], "text": ["38.8 sec"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "yaxis": {"title": "sec", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>HF Baseline</td><td>gemma4-moe</td><td>837.4 ms</td><td>83.6 ms</td><td>12.0 tok/s</td><td>—</td><td>11</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>torch.compile</td><td>gemma4-moe</td><td>245.5 ms</td><td>655.0 ms</td><td>1.5 tok/s</td><td>38816 ms</td><td>11</td><td style="color:#777">sequential per-token, StaticCache KV cache (torch.compile inductor)</td></tr>
+<tr style="background:#fff0f0"><td>luminal backend</td><td>gemma4-moe</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">bench_python_luminal.py failed with code -9</td></tr>
+<tr style="background:#fff0f0"><td>Rust (luminal)</td><td>gemma4-moe</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">rust bench failed with code -9</td></tr></tbody></table>
+</section>
+
+<section id="gemma4_moe_sweep">
+  <h2>Gemma4-Moe Sweep</h2>
+  <p class="meta">google/gemma-4-26B-A4B · 2 results</p>
+  <div class="charts-row"><div id="fig21" class="chart"></div><script>Plotly.newPlot("fig21", [], {"title": "TTFT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10], "ticktext": ["10"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr style="background:#fff0f0"><td>luminal backend</td><td>s=10</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">bench_python_luminal.py failed with code -9</td></tr>
+<tr style="background:#fff0f0"><td>Rust (luminal)</td><td>s=10</td><td>—</td><td>—</td><td>—</td><td>—</td><td>—</td><td style="color:#c00">rust bench failed with code -9</td></tr></tbody></table>
+</section>
+
+<section id="qwen3_moe_comparison">
+  <h2>Qwen3-Moe Comparison</h2>
+  <p class="meta">Qwen/Qwen3-30B-A3B · 19 prompt tokens · 4 results</p>
+  <div class="charts-row"><div id="fig22" class="chart"></div><script>Plotly.newPlot("fig22", [{"type": "bar", "name": "HF Baseline", "x": ["qwen3-moe"], "y": [1565.540504961973], "text": ["1565.5 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["qwen3-moe"], "y": [460.077923577046], "text": ["460.1 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["qwen3-moe"], "y": [21002.791983017232], "text": ["21002.8 ms"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["qwen3-moe"], "y": [662.07], "text": ["662.1 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig23" class="chart"></div><script>Plotly.newPlot("fig23", [{"type": "bar", "name": "HF Baseline", "x": ["qwen3-moe"], "y": [84.527321747737], "text": ["84.5 ms"], "textposition": "outside", "marker": {"color": "#888888"}, "hovertemplate": "%{x}<br>HF Baseline: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "torch.compile", "x": ["qwen3-moe"], "y": [753.0061075551203], "text": ["753.0 ms"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["qwen3-moe"], "y": [1166.8824461026816], "text": ["1166.9 ms"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["qwen3-moe"], "y": [60.08], "text": ["60.1 ms"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "yaxis": {"title": "ms", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig24" class="chart"></div><script>Plotly.newPlot("fig24", [{"type": "bar", "name": "torch.compile", "x": ["qwen3-moe"], "y": [8.341281775035895], "text": ["8.3 sec"], "textposition": "outside", "marker": {"color": "#5ab552"}, "hovertemplate": "%{x}<br>torch.compile: %{y:.1f} sec<extra></extra>"}, {"type": "bar", "name": "luminal backend", "x": ["qwen3-moe"], "y": [111.70731823903043], "text": ["111.7 sec"], "textposition": "outside", "marker": {"color": "#4c9ed9"}, "hovertemplate": "%{x}<br>luminal backend: %{y:.1f} sec<extra></extra>"}, {"type": "bar", "name": "Rust (luminal)", "x": ["qwen3-moe"], "y": [80.83241000000001], "text": ["80.8 sec"], "textposition": "outside", "marker": {"color": "#d97a4c"}, "hovertemplate": "%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "yaxis": {"title": "sec", "rangemode": "tozero"}, "barmode": "group", "legend": {"orientation": "h", "y": -0.2}, "margin": {"t": 50, "b": 80}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>HF Baseline</td><td>qwen3-moe</td><td>1565.5 ms</td><td>84.5 ms</td><td>11.8 tok/s</td><td>—</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>torch.compile</td><td>qwen3-moe</td><td>460.1 ms</td><td>753.0 ms</td><td>1.3 tok/s</td><td>8341 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache (torch.compile inductor)</td></tr>
+<tr><td>luminal backend</td><td>qwen3-moe</td><td>21002.8 ms</td><td>1166.9 ms</td><td>0.9 tok/s</td><td>111707 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>qwen3-moe</td><td>662.1 ms</td><td>60.1 ms</td><td>16.6 tok/s</td><td>80832 ms</td><td>—</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+
+<section id="qwen3_moe_sweep">
+  <h2>Qwen3-Moe Sweep</h2>
+  <p class="meta">Qwen/Qwen3-30B-A3B · 19 prompt tokens · 6 results</p>
+  <div class="charts-row"><div id="fig25" class="chart"></div><script>Plotly.newPlot("fig25", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [21002.663500519702, 21018.686580006033, 21034.366824431345], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [656.7, 540.37, 542.34], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TTFT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig26" class="chart"></div><script>Plotly.newPlot("fig26", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [1166.6714247548953, 1167.2746865515364, 1168.7990181031637], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} ms<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [59.6, 48.79, 48.88], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} ms<extra></extra>"}], {"title": "TPOT", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "ms", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script><div id="fig27" class="chart"></div><script>Plotly.newPlot("fig27", [{"type": "scatter", "mode": "lines+markers", "name": "luminal backend", "x": [10, 100, 500], "y": [93.47603664599592, 132.266081985028, 298.05094401398674], "marker": {"size": 8, "color": "#4c9ed9"}, "line": {"color": "#4c9ed9", "width": 2}, "hovertemplate": "iters=%{x}<br>luminal backend: %{y:.1f} sec<extra></extra>"}, {"type": "scatter", "mode": "lines+markers", "name": "Rust (luminal)", "x": [10, 100, 500], "y": [25.48138, 47.5342, 134.79345], "marker": {"size": 8, "color": "#d97a4c"}, "line": {"color": "#d97a4c", "width": 2}, "hovertemplate": "iters=%{x}<br>Rust (luminal): %{y:.1f} sec<extra></extra>"}], {"title": "Time to Search", "xaxis": {"title": "Search iterations", "type": "log", "tickvals": [10, 100, 500], "ticktext": ["10", "100", "500"]}, "yaxis": {"title": "sec", "rangemode": "tozero"}, "legend": {"orientation": "h", "y": -0.25}, "margin": {"t": 50, "b": 90}, "plot_bgcolor": "#fafafa", "paper_bgcolor": "#ffffff"}, {responsive:true});</script></div>
+  <table><thead><tr><th>Path</th><th>Config</th><th>TTFT</th><th>TPOT</th><th>Throughput</th><th>Compile</th><th>Prompt tokens</th><th>Note</th></tr></thead><tbody><tr><td>luminal backend</td><td>s=10</td><td>21002.7 ms</td><td>1166.7 ms</td><td>0.9 tok/s</td><td>93476 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=10</td><td>656.7 ms</td><td>59.6 ms</td><td>16.8 tok/s</td><td>25481 ms</td><td>—</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr><td>luminal backend</td><td>s=100</td><td>21018.7 ms</td><td>1167.3 ms</td><td>0.9 tok/s</td><td>132266 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=100</td><td>540.4 ms</td><td>48.8 ms</td><td>20.5 tok/s</td><td>47534 ms</td><td>—</td><td style="color:#777">sum of per-token prefill durations</td></tr>
+<tr><td>luminal backend</td><td>s=500</td><td>21034.4 ms</td><td>1168.8 ms</td><td>0.9 tok/s</td><td>298051 ms</td><td>19</td><td style="color:#777">sequential per-token, StaticCache KV cache</td></tr>
+<tr><td>Rust (luminal)</td><td>s=500</td><td>542.3 ms</td><td>48.9 ms</td><td>20.5 tok/s</td><td>134793 ms</td><td>—</td><td style="color:#777">sum of per-token prefill durations</td></tr></tbody></table>
+</section>
+</main>
+</body>
+</html>
--- a/benchmarks/ttft/run.py
+++ b/benchmarks/ttft/run.py
@@ -0,0 +1,683 @@
+"""TTFT + TPOT benchmark orchestrator.
+
+Runs four paths in isolated subprocesses:
+  1. python_baseline       — HuggingFace / PyTorch eager on CUDA
+  2. python_torch_compile  — torch.compile(model) inductor backend
+  3. python_luminal        — torch.compile(model, backend=luminal_backend)
+  4. rust                  — examples/<package> binary (luminal_cuda_lite)
+
+Use --config to select a named configuration, or --all-configs to run every
+entry in CONFIGS. All output is written to the SQLite bench DB
+(benchmarks/ttft/bench.db); the TUI / dashboard / report read from there.
+
+Notes on comparability:
+  - python_baseline: single chunked forward for TTFT; KV-cache decode for TPOT.
+  - python_torch_compile: inductor, same chunked prefill as baseline; first
+    call triggers JIT compilation (recorded separately as compile_ms).
+  - python_luminal: sequential per-token prefill with StaticCache; TPOT via
+    autoregressive decode steps.
+  - rust: sequential per-token prefill; TTFT = sum of prefill step durations.
+Steady-state execution only — compile / egraph-search time excluded from TTFT but
+recorded separately as compile_ms for all paths that support it.
+"""
+
+import argparse
+import datetime
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+try:
+    import tomllib
+except ImportError:
+    try:
+        import tomli as tomllib  # type: ignore[no-redef]
+    except ImportError:
+        raise ImportError("Python 3.11+ or 'pip install tomli' required to load benchmarks.toml")
+
+import db
+
+BENCH_DIR = Path(__file__).resolve().parent
+REPO_ROOT = BENCH_DIR.parent.parent
+
+DEFAULT_PROMPT = "Explain what a neural network is in a paragraph."
+DEFAULT_MODEL = "NousResearch/Meta-Llama-3-8B-Instruct"
+
+_CONFIG_PATH = BENCH_DIR / "benchmarks.toml"
+with open(_CONFIG_PATH, "rb") as _f:
+    _BENCH_CONFIG = tomllib.load(_f)
+
+# Named benchmark configurations.  Each entry overrides any subset of the
+# CLI defaults; explicit CLI flags always take precedence over the config.
+CONFIGS: dict = _BENCH_CONFIG["configs"]
+UR_TEST_MODELS: list = _BENCH_CONFIG["ur_test"]["models"]
+SEARCH_SWEEP_ITERS: list = _BENCH_CONFIG["ur_test"]["search_sweep_iters"]
+
+SWEEP_CONFIG_PREFIX = "s="
+
+BENCH_LINE = re.compile(r"^BENCH_RESULT (.*)$", re.MULTILINE)
+RUST_TTFT_LINE = re.compile(r"TTFT:\s*([0-9]+\.?[0-9]*)\s*ms")
+RUST_TPOT_LINE = re.compile(r"TPOT:\s*([0-9]+\.?[0-9]*)\s*ms")
+RUST_COMPILE_LINE = re.compile(r"COMPILE:\s*([0-9]+\.?[0-9]*)\s*ms")
+RUST_PROMPT_LINE = re.compile(r"Prompt:\s*(\d+)\s*tokens")
+
+
+def _stream(proc, tee_prefix):
+    """Drain subprocess stdout, tee-ing to our stdout line-by-line. Returns full stdout."""
+    buf = []
+    assert proc.stdout is not None
+    for line in proc.stdout:
+        buf.append(line)
+        sys.stdout.write(f"[{tee_prefix}] {line}")
+        sys.stdout.flush()
+    proc.wait()
+    return "".join(buf)
+
+
+_MEM_LOG_PATH = os.environ.get("BENCH_MEM_LOG", "/tmp/bench_mem_snapshots.log")
+
+
+def _snapshot_memory(label: str) -> None:
+    """Append a host+GPU memory snapshot to BENCH_MEM_LOG. Cheap, never raises."""
+    try:
+        ts = datetime.datetime.now().isoformat(timespec="seconds")
+        meminfo_keys = ("MemTotal", "MemFree", "MemAvailable", "Cached", "Slab", "SReclaimable")
+        meminfo = {}
+        with open("/proc/meminfo") as f:
+            for line in f:
+                k, _, rest = line.partition(":")
+                if k in meminfo_keys:
+                    meminfo[k] = rest.strip().split()[0]  # kB
+        try:
+            gpu = subprocess.check_output(
+                ["nvidia-smi", "--query-gpu=memory.used,memory.free,memory.total",
+                 "--format=csv,noheader,nounits"],
+                stderr=subprocess.DEVNULL, text=True, timeout=5,
+            ).strip().splitlines()[0]
+        except Exception:
+            gpu = "n/a"
+        parent_rss = "?"
+        try:
+            with open(f"/proc/{os.getpid()}/status") as f:
+                for line in f:
+                    if line.startswith("VmRSS:"):
+                        parent_rss = line.split()[1]
+                        break
+        except Exception:
+            pass
+        host_str = " ".join(f"{k}={meminfo.get(k, '?')}kB" for k in meminfo_keys)
+        with open(_MEM_LOG_PATH, "a") as f:
+            f.write(f"{ts}  [{label}]  parent_rss={parent_rss}kB  {host_str}  gpu(used,free,total MiB)={gpu}\n")
+    except Exception as e:
+        sys.stderr.write(f"[mem-snapshot warn] {e}\n")
+
+
+def _cargo_env():
+    """Return env dict with ~/.cargo/bin prepended to PATH."""
+    cargo_bin = str(Path.home() / ".cargo" / "bin")
+    path = os.environ.get("PATH", "")
+    if cargo_bin not in path:
+        path = f"{cargo_bin}:{path}"
+    return {**os.environ, "PATH": path}
+
+
+def run_rust(_prompt, package="llama", env_vars=None):
+    print(f"\n=== Running: rust (examples/{package}) ===", flush=True)
+    cmd = ["cargo", "run", "--release", "-p", package]
+    env = _cargo_env()
+    if env_vars:
+        env.update(env_vars)
+    proc = subprocess.Popen(
+        cmd,
+        cwd=REPO_ROOT,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        env=env,
+    )
+    output = _stream(proc, "rust")
+    if proc.returncode != 0:
+        raise RuntimeError(f"rust bench failed with code {proc.returncode}")
+    m = RUST_TTFT_LINE.search(output)
+    if not m:
+        raise RuntimeError("could not find 'TTFT: X ms' in rust stdout")
+    ttft_ms = float(m.group(1))
+    result = {
+        "path": "rust",
+        "model": DEFAULT_MODEL,
+        "ttft_ms": ttft_ms,
+        "note": "sum of per-token prefill durations",
+    }
+    m_compile = RUST_COMPILE_LINE.search(output)
+    if m_compile:
+        result["compile_ms"] = float(m_compile.group(1))
+    m_tpot = RUST_TPOT_LINE.search(output)
+    if m_tpot:
+        tpot_ms = float(m_tpot.group(1))
+        result["tpot_ms"] = tpot_ms
+        result["throughput_tps"] = 1000.0 / tpot_ms
+    m_prompt = RUST_PROMPT_LINE.search(output)
+    if m_prompt:
+        result["prompt_tokens"] = int(m_prompt.group(1))
+    return result
+
+
+def run_python_script(name, extra_args):
+    script = BENCH_DIR / name
+    print(f"\n=== Running: {script.name} ===", flush=True)
+    cmd = [sys.executable, str(script), *extra_args]
+    proc = subprocess.Popen(
+        cmd,
+        cwd=REPO_ROOT,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        env={**os.environ},
+    )
+    output = _stream(proc, script.stem)
+    if proc.returncode != 0:
+        raise RuntimeError(f"{script.name} failed with code {proc.returncode}")
+    m = BENCH_LINE.search(output)
+    if not m:
+        raise RuntimeError(f"no BENCH_RESULT line in {script.name} output")
+    return json.loads(m.group(1))
+
+
+PATH_ORDER = ["python_baseline", "python_torch_compile", "python_luminal", "rust"]
+PATH_LABELS = {
+    "python_baseline": "Python\n(HF baseline)",
+    "python_torch_compile": "Python\n(torch.compile)",
+    "python_luminal": "Python → Rust\n(luminal_backend)",
+    "rust": "Rust\n(examples/llama)",
+}
+PATH_COLORS = {
+    "python_baseline": "#888888",
+    "python_torch_compile": "#5ab552",
+    "python_luminal": "#4c9ed9",
+    "rust": "#d97a4c",
+}
+
+
+def run_one_config(config_name, settings, global_skip, inter_path_cooldown=0):
+    """Run all four paths for one config. Returns list of result dicts tagged with 'config'."""
+    model = settings["model"]
+    rust_package = settings["rust_package"]
+    prompt = settings["prompt"]
+    iters = settings["iters"]
+    warmups = settings["warmups"]
+    decode_tokens = settings["decode_tokens"]
+    search_iters = settings["search_iters"]
+    dtype = settings.get("dtype", "float32")
+    skip = set(global_skip) | set(settings.get("skip", []))
+
+    common_py = [
+        "--model", model,
+        "--prompt", prompt,
+        "--iters", str(iters),
+        "--warmups", str(warmups),
+        "--decode-tokens", str(decode_tokens),
+        "--dtype", dtype,
+    ]
+    luminal_py = common_py + ["--search-iters", str(search_iters)]
+
+    rust_env = {"SEARCH_GRAPHS": str(search_iters), "PROMPT": prompt, "ITERS": str(iters)}
+
+    results = []
+    first_path = True
+    for path, fn in [
+        ("python_baseline",      lambda: run_python_script("bench_python_baseline.py", common_py)),
+        ("python_torch_compile", lambda: run_python_script("bench_python_torch_compile.py", common_py)),
+        ("python_luminal",       lambda: run_python_script("bench_python_luminal.py", luminal_py)),
+        ("rust",                 lambda: run_rust(prompt, package=rust_package, env_vars=rust_env)),
+    ]:
+        if path in skip:
+            continue
+        if not first_path and inter_path_cooldown > 0:
+            print(f"  [cooldown {inter_path_cooldown}s]", flush=True)
+            time.sleep(inter_path_cooldown)
+        first_path = False
+        _snapshot_memory(f"{config_name}/{path} BEFORE")
+        try:
+            r = fn()
+            r["config"] = config_name
+            r["model"] = model  # ensure correct model is always tagged
+            if path in ("python_luminal", "rust"):
+                r["search_iters"] = search_iters
+            results.append(r)
+        except Exception as e:
+            print(f"\n[WARN] {config_name}/{path} failed: {e}", flush=True)
+            results.append({
+                "path": path,
+                "config": config_name,
+                "model": model,
+                "error": str(e),
+                "ttft_ms": None,
+            })
+        _snapshot_memory(f"{config_name}/{path} AFTER")
+    return results
+
+
+def plot(results, out_path):
+    import matplotlib
+
+    matplotlib.use("Agg")
+    import matplotlib.pyplot as plt
+
+    # Group by config so each config gets its own subplot column.
+    configs_seen: list[str] = []
+    by_config: dict[str, dict] = {}
+    for r in results:
+        cfg = r.get("config", "default")
+        if cfg not in by_config:
+            configs_seen.append(cfg)
+            by_config[cfg] = {}
+        by_config[cfg][r["path"]] = r
+
+    has_tpot = any(
+        r.get("tpot_ms") is not None
+        for r in results
+        if not r.get("error")
+    )
+    nrows = 2 if has_tpot else 1
+    ncols = len(configs_seen)
+    fig, axes = plt.subplots(nrows, ncols, figsize=(6 * ncols, 4.5 * nrows), squeeze=False)
+
+    for col, cfg in enumerate(configs_seen):
+        by_path = by_config[cfg]
+        present = [p for p in PATH_ORDER if p in by_path]
+
+        def _bar(ax, title, ylabel, key):
+            raw = [by_path[p].get(key) for p in present]
+            ys = [v if v is not None else 0.0 for v in raw]
+            cs = [PATH_COLORS.get(p, "#aaaaaa") if raw[i] is not None else "#cccccc"
+                  for i, p in enumerate(present)]
+            xs = [PATH_LABELS.get(p, p) for p in present]
+            bars = ax.bar(xs, ys, color=cs)
+            ax.set_ylabel(ylabel)
+            ax.set_title(f"{title} — {cfg}")
+            ax.grid(axis="y", alpha=0.3)
+            for b, v in zip(bars, raw):
+                if v is not None:
+                    ax.text(b.get_x() + b.get_width() / 2, v, f"{v:.0f} ms",
+                            ha="center", va="bottom", fontsize=9)
+
+        _bar(axes[0][col], "TTFT", "Time to first token (ms)", "ttft_ms")
+        if has_tpot:
+            _bar(axes[1][col], "TPOT", "Time per output token (ms)", "tpot_ms")
+
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=150)
+    print(f"wrote {out_path}")
+
+
+def run_ur_test(args, conn, run_id):
+    """The ur-test: all 4 paths at default budget + full search sweep, for each model.
+
+    Inserts each result into the DB as it is produced so a mid-run crash still
+    leaves partial data behind.
+    """
+    all_results = []
+
+    for model_idx, model_key in enumerate(UR_TEST_MODELS):
+        s = _settings_for_config(model_key, args)
+
+        if model_idx > 0:
+            print(f"\n  [cooldown 30s between models]", flush=True)
+            time.sleep(30)
+
+        # ── Phase 1: comparison — all 4 paths at the model's default search budget ──
+        print(f"\n{'='*60}\nUR-TEST COMPARISON: {model_key}\n{'='*60}", flush=True)
+        comp_results = run_one_config(model_key, s, args.skip, inter_path_cooldown=20)
+        for r in comp_results:
+            r["model_key"] = model_key
+            db.insert_result(conn, run_id, r)
+        conn.commit()
+        all_results.extend(comp_results)
+
+        # ── Phase 2: search sweep — python_luminal + rust across all budgets ──
+        if args.no_sweep:
+            continue
+        print(f"\n{'='*60}\nUR-TEST SWEEP: {model_key}\n{'='*60}", flush=True)
+        sweep_skip_base = set(args.skip) | {"python_baseline", "python_torch_compile"}
+        # Memory peak in egglog Search grows monotonically with search-iters.
+        # If a path SIGKILLs (-9) at budget N, every higher budget will too —
+        # skip it to avoid wasting another ~hour per model on guaranteed OOMs.
+        oom_paths: set[str] = set()
+        for n in SEARCH_SWEEP_ITERS:
+            print(f"  [cooldown 20s before s={n}]", flush=True)
+            time.sleep(20)
+            sweep_skip = list(sweep_skip_base | oom_paths)
+            if oom_paths:
+                print(f"  [skip-on-prior-OOM] {sorted(oom_paths)} OOM'd at lower budget; skipping at s={n}", flush=True)
+            sweep_s = {**s, "search_iters": n}
+            results_n = run_one_config(f"s={n}", sweep_s, sweep_skip, inter_path_cooldown=20)
+            for r in results_n:
+                r["model_key"] = model_key  # preserve ur-test model identity for dashboard
+                db.insert_result(conn, run_id, r)
+                if "code -9" in (r.get("error") or ""):
+                    oom_paths.add(r["path"])
+            conn.commit()
+            all_results.extend(results_n)
+
+    print("\nGenerate report with:")
+    print(f"  python3 benchmarks/ttft/gen_report.py --db benchmarks/ttft/bench.db --run {run_id} \\")
+    print("    --out benchmarks/ttft/report.html")
+    print("\nGenerate dashboard with:")
+    print("  python3 benchmarks/ttft/gen_dashboard.py --out benchmarks/ttft/dashboard.html")
+
+    return all_results
+
+
+def _git_info():
+    """Return (short_commit, branch) from the repo, or ('unknown', 'unknown') if unavailable."""
+    try:
+        commit = subprocess.check_output(
+            ["git", "rev-parse", "--short", "HEAD"],
+            cwd=REPO_ROOT, stderr=subprocess.DEVNULL, text=True,
+        ).strip()
+        branch = subprocess.check_output(
+            ["git", "rev-parse", "--abbrev-ref", "HEAD"],
+            cwd=REPO_ROOT, stderr=subprocess.DEVNULL, text=True,
+        ).strip()
+        return commit, branch
+    except Exception:
+        return "unknown", "unknown"
+
+
+def _gpu_info() -> dict:
+    """Return GPU metadata from nvidia-smi, or empty dict if unavailable."""
+    try:
+        out = subprocess.check_output(
+            [
+                "nvidia-smi",
+                "--query-gpu=name,driver_version,memory.total",
+                "--format=csv,noheader,nounits",
+            ],
+            stderr=subprocess.DEVNULL,
+            text=True,
+        ).strip()
+        if not out:
+            return {}
+        parts = [p.strip() for p in out.splitlines()[0].split(",")]
+        if len(parts) < 3:
+            return {}
+        return {
+            "gpu_name": parts[0],
+            "gpu_driver": parts[1],
+            "gpu_vram_mb": int(parts[2]),
+        }
+    except Exception:
+        return {}
+
+
+def _cuda_version() -> str:
+    """Return CUDA version string from nvidia-smi, or 'unknown'."""
+    try:
+        out = subprocess.check_output(
+            ["nvidia-smi", "--query", "--display=COMPUTE"],
+            stderr=subprocess.DEVNULL,
+            text=True,
+        )
+        for line in out.splitlines():
+            if "CUDA Version" in line:
+                return line.split(":")[-1].strip()
+    except Exception:
+        pass
+    try:
+        out = subprocess.check_output(
+            ["nvidia-smi"], stderr=subprocess.DEVNULL, text=True
+        )
+        import re as _re
+        m = _re.search(r"CUDA Version:\s*([\d.]+)", out)
+        if m:
+            return m.group(1)
+    except Exception:
+        pass
+    return "unknown"
+
+
+def _record_run(conn, mode):
+    """Insert a `runs` row capturing this orchestrator invocation. Returns run_id.
+
+    Uses microsecond resolution in the run_id so two invocations within the
+    same wallclock second never collide on the runs PRIMARY KEY (insert_run
+    defaults to OR IGNORE, which would otherwise silently merge them and
+    corrupt history). Microseconds also let the dashboard plot back-to-back
+    runs at distinct x-positions instead of stacking them on one date label.
+    """
+    now = datetime.datetime.now()
+    run_id = now.strftime("%Y-%m-%dT%H-%M-%S-%f")
+    commit, branch = _git_info()
+    db.insert_run(
+        conn,
+        run_id=run_id,
+        timestamp=now.isoformat(),
+        mode=mode,
+        git_commit=commit,
+        git_branch=branch,
+        cuda_version=_cuda_version(),
+        **_gpu_info(),
+    )
+    conn.commit()
+    return run_id
+
+
+def _settings_from_args(args):
+    """Build a settings dict from parsed CLI args."""
+    return {
+        "model": args.model,
+        "rust_package": args.rust_package,
+        "prompt": args.prompt,
+        "iters": args.iters,
+        "warmups": args.warmups,
+        "decode_tokens": args.decode_tokens,
+        "search_iters": args.search_iters,
+        "dtype": args.dtype,
+        "skip": [],
+    }
+
+
+def _settings_for_config(config_name, args):
+    """Merge CONFIGS[config_name] over CLI arg defaults."""
+    cfg = CONFIGS[config_name]
+    return {
+        "model":        cfg.get("model",        args.model),
+        "rust_package": cfg.get("rust_package", args.rust_package),
+        "prompt":       cfg.get("prompt",       args.prompt),
+        "iters":        cfg.get("iters",        args.iters),
+        "warmups":      cfg.get("warmups",      args.warmups),
+        "decode_tokens":cfg.get("decode_tokens",args.decode_tokens),
+        "search_iters": cfg.get("search_iters", args.search_iters),
+        "dtype":        cfg.get("dtype",        args.dtype),
+        "skip":         cfg.get("skip",         []),
+    }
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument(
+        "--config",
+        choices=list(CONFIGS),
+        default=None,
+        help="Named benchmark configuration. Sets parameter defaults; explicit flags override.",
+    )
+    ap.add_argument(
+        "--all-configs",
+        action="store_true",
+        dest="all_configs",
+        help="Run every entry in CONFIGS into a single run_id in the DB.",
+    )
+    ap.add_argument(
+        "--search-sweep",
+        action="store_true",
+        dest="search_sweep",
+        help=(
+            "Run python_luminal + rust across all SEARCH_SWEEP_ITERS budgets "
+            f"({SEARCH_SWEEP_ITERS}). Uses --config (default: llama-8b) as the base settings."
+        ),
+    )
+    ap.add_argument(
+        "--skip-configs",
+        nargs="*",
+        default=[],
+        choices=list(CONFIGS),
+        dest="skip_configs",
+        metavar="CONFIG",
+        help="Config names to exclude when using --all-configs.",
+    )
+    ap.add_argument(
+        "--no-sweep",
+        action="store_true",
+        dest="no_sweep",
+        help=(
+            "With --ur-test: skip the search-budget sweep phase and only run "
+            "the 4-path comparison for each model. ~1.5 hr instead of ~5 hr."
+        ),
+    )
+    ap.add_argument("--model", default=DEFAULT_MODEL)
+    ap.add_argument("--rust-package", default="llama", dest="rust_package",
+                    help="Cargo package name for the rust bench (examples/<name>).")
+    ap.add_argument("--prompt", default=DEFAULT_PROMPT)
+    ap.add_argument("--iters", type=int, default=3)
+    ap.add_argument("--warmups", type=int, default=1)
+    ap.add_argument("--skip", nargs="*", default=[],
+                    choices=["rust", "python_luminal", "python_baseline", "python_torch_compile"])
+    ap.add_argument("--out", default=str(BENCH_DIR / "ttft.png"))
+    ap.add_argument("--db", default=str(db.DEFAULT_DB_PATH),
+                    help="SQLite database file (default: benchmarks/ttft/bench.db).")
+    ap.add_argument("--run", default=None, dest="run",
+                    help="With --render-only: run_id to render (default: latest).")
+    ap.add_argument(
+        "--decode-tokens", type=int, default=50,
+        help="Tokens to generate for TPOT measurement (0 = skip TPOT).",
+    )
+    ap.add_argument(
+        "--search-iters", type=int, default=500,
+        help="Egraph search iterations for the python_luminal path.",
+    )
+    ap.add_argument(
+        "--dtype", default="float32",
+        choices=["float32", "bfloat16", "float16"],
+        help="Torch dtype for the python paths. Configs may override per-model.",
+    )
+    ap.add_argument(
+        "--render-only", action="store_true",
+        help="Skip running benches; render an existing run from the DB. "
+             "Use --run RUN_ID to pick a specific run, otherwise the latest is used.",
+    )
+    ap.add_argument(
+        "--ur-test", action="store_true", dest="ur_test",
+        help=(
+            f"The mega-test: run all 4 paths at default budget + full search sweep "
+            f"({SEARCH_SWEEP_ITERS}) for each of {UR_TEST_MODELS}."
+        ),
+    )
+
+    # Pre-parse to apply named config as argparse defaults so explicit CLI
+    # flags still override them.
+    pre, _ = ap.parse_known_args()
+    if pre.config and not (pre.all_configs or getattr(pre, "search_sweep", False)):
+        cfg = CONFIGS[pre.config]
+        ap.set_defaults(**{k: v for k, v in cfg.items() if k not in ("skip",)})
+    args = ap.parse_args()
+    if pre.config and not args.all_configs and not args.search_sweep:
+        for path in CONFIGS[pre.config].get("skip", []):
+            if path not in args.skip:
+                args.skip.append(path)
+
+    conn = db.connect(args.db)
+
+    if args.render_only:
+        run_id = args.run or db.latest_run_id(conn)
+        if run_id is None:
+            sys.exit(f"--render-only: no runs found in {args.db}")
+        results = db.load_results(conn, run_id)
+        if not results:
+            sys.exit(f"--render-only: no results found for run {run_id} in {args.db}")
+        print(f"rendering run {run_id} ({len(results)} results)")
+    else:
+        mode = (
+            ("ur-test-fast" if args.no_sweep else "ur-test") if args.ur_test
+            else "search-sweep" if args.search_sweep
+            else "all-configs"  if args.all_configs
+            else "single"
+        )
+        run_id = _record_run(conn, mode)
+        print(f"run_id: {run_id}  →  {args.db}")
+
+        if args.ur_test:
+            results = run_ur_test(args, conn, run_id)
+        elif args.search_sweep:
+            results = []
+            # Base settings come from --config (default: llama-8b) or bare CLI args.
+            base = (
+                _settings_for_config(args.config, args)
+                if args.config
+                else _settings_for_config("llama-8b", args)
+            )
+            sweep_skip = set(args.skip) | {"python_baseline", "python_torch_compile"}
+            for i, n in enumerate(SEARCH_SWEEP_ITERS):
+                if i > 0:
+                    print(f"  [cooldown 20s — letting CUDA free previous model memory]", flush=True)
+                    time.sleep(20)
+                print(f"\n{'='*60}\nSEARCH SWEEP: s={n}\n{'='*60}", flush=True)
+                s = {**base, "search_iters": n}
+                rs = run_one_config(f"s={n}", s, list(sweep_skip))
+                for r in rs:
+                    db.insert_result(conn, run_id, r)
+                conn.commit()
+                results.extend(rs)
+        elif args.all_configs:
+            results = []
+            for config_name in CONFIGS:
+                if config_name in args.skip_configs:
+                    continue
+                print(f"\n{'='*60}\nCONFIG: {config_name}\n{'='*60}", flush=True)
+                settings = _settings_for_config(config_name, args)
+                rs = run_one_config(config_name, settings, args.skip)
+                for r in rs:
+                    db.insert_result(conn, run_id, r)
+                conn.commit()
+                results.extend(rs)
+        else:
+            config_name = args.config or "default"
+            settings = (
+                _settings_for_config(args.config, args)
+                if args.config
+                else _settings_from_args(args)
+            )
+            results = run_one_config(config_name, settings, args.skip)
+            for r in results:
+                db.insert_result(conn, run_id, r)
+            conn.commit()
+
+    # Summary
+    configs_in_results = list(dict.fromkeys(r.get("config", "default") for r in results))
+    for cfg in configs_in_results:
+        group = [r for r in results if r.get("config", "default") == cfg]
+        print(f"\nSummary ({cfg}):")
+        for r in group:
+            if r.get("error"):
+                print(f"  {r['path']:>22}:  FAILED — {r['error']}")
+                continue
+            if r.get("ttft_ms") is None:
+                print(f"  {r['path']:>22}:  no data")
+                continue
+            compile_ms = r.get("compile_ms")
+            compile_str = f"  compile {compile_ms:.0f} ms" if compile_ms is not None else ""
+            tpot = r.get("tpot_ms")
+            tput = r.get("throughput_tps")
+            tpot_str = f"  TPOT {tpot:.2f} ms  ({tput:.1f} tok/s)" if tpot is not None else ""
+            print(f"  {r['path']:>22}:  TTFT {r['ttft_ms']:.2f} ms{compile_str}{tpot_str}")
+
+    plot(results, args.out)
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/ttft/run.sh
+++ b/benchmarks/ttft/run.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# TTFT benchmark entrypoint. Runs via uv against the luminal_python venv.
+set -e
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+REPO_ROOT="$( cd "$SCRIPT_DIR/../.." && pwd )"
+cd "$REPO_ROOT/crates/luminal_python"
+exec uv run python "$SCRIPT_DIR/run.py" "$@"
--- a/benchmarks/ttft/ttft.png
+++ b/benchmarks/ttft/ttft.png
--- a/ci/example_output.py
+++ b/ci/example_output.py
@@ -1,85 +0,0 @@
-import re
-
-ANSI_ESCAPE = re.compile(r"\x1b\[[0-?]*[ -/]*[@-~]")
-
-EXPECTED_OUTPUT = {
-    "gemma4_moe": [
-        "city of romance, art and culture",
-    ],
-    "whisper": [
-        "ask not what your country can do for you",
-    ],
-}
-
-EXPECTED_CONCEPTS = {
-    "llama": [
-        ["layers"],
-        ["neurons", "nodes"],
-        ["learn", "learning", "adapt"],
-        ["data", "patterns", "features"],
-    ],
-    "gemma": [
-        ["neural network", "neural networks"],
-        ["nodes", "neurons"],
-        ["layers"],
-        ["weights"],
-        ["training", "learn", "learns"],
-    ],
-    "qwen": [
-        ["neural network", "neural networks"],
-        ["computational model", "computational system"],
-        ["brain"],
-        ["layers"],
-        ["neurons", "nodes"],
-        ["learn", "learning", "training"],
-    ],
-    "qwen3_moe": [
-        ["capital"],
-        ["france"],
-        ["paris"],
-    ],
-}
-
-
-def normalize_output(output: str) -> str:
-    output = ANSI_ESCAPE.sub("", output)
-    output = output.replace("\r", "\n")
-    return re.sub(r"\s+", " ", output).casefold()
-
-
-def validate_output(example: str, output: str):
-    normalized_output = normalize_output(output)
-
-    expected_concepts = EXPECTED_CONCEPTS.get(example)
-    if expected_concepts is not None:
-        missing = [
-            concept_group
-            for concept_group in expected_concepts
-            if not any(normalize_output(term) in normalized_output for term in concept_group)
-        ]
-        if missing:
-            expected = "\n  - ".join(" / ".join(group) for group in expected_concepts)
-            missing_terms = "\n  - ".join(" / ".join(group) for group in missing)
-            raise AssertionError(
-                f"Output check failed for {example!r}.\n"
-                f"Expected concept groups:\n  - {expected}\n"
-                f"Missing concept groups:\n  - {missing_terms}"
-            )
-
-        expected = ", ".join(" / ".join(group) for group in expected_concepts)
-        print(f"\nOutput check passed for {example!r}: found concepts {expected}")
-        return
-
-    expected_phrases = EXPECTED_OUTPUT.get(example)
-    if expected_phrases is None:
-        raise ValueError(f"No expected output phrases configured for example {example!r}")
-
-    for phrase in expected_phrases:
-        if normalize_output(phrase) in normalized_output:
-            print(f"\nOutput check passed for {example!r}: found {phrase!r}")
-            return
-
-    expected = "\n  - ".join(expected_phrases)
-    raise AssertionError(
-        f"Output check failed for {example!r}. Expected one of:\n  - {expected}"
-    )
--- a/ci/metal_qwen_example.py
+++ b/ci/metal_qwen_example.py
@@ -1,46 +0,0 @@
-import os
-import subprocess
-import sys
-
-from example_output import validate_output
-
-def run_and_capture(command: list[str], *, cwd: str, env: dict[str, str]) -> str:
-    process = subprocess.Popen(
-        command,
-        cwd=cwd,
-        env=env,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-    )
-    assert process.stdout is not None
-
-    chunks = []
-    while True:
-        chunk = process.stdout.read1(4096)
-        if not chunk:
-            break
-        sys.stdout.buffer.write(chunk)
-        sys.stdout.buffer.flush()
-        chunks.append(chunk)
-
-    return_code = process.wait()
-    output = b"".join(chunks).decode("utf-8", errors="replace")
-    if return_code:
-        raise subprocess.CalledProcessError(return_code, command, output=output)
-    return output
-
-
-def main():
-    repo_root = os.environ.get("GITHUB_WORKSPACE", os.getcwd())
-    output = run_and_capture(
-        ["cargo", "run", "--release", "-p", "qwen", "--features", "metal"],
-        cwd=repo_root,
-        env=os.environ.copy(),
-    )
-    if "TTFT:" not in output or "TPOT:" not in output:
-        raise AssertionError("qwen Metal example did not complete generation")
-    validate_output("qwen", output)
-
-
-if __name__ == "__main__":
-    main()
--- a/ci/modal_cargo_test.py
+++ b/ci/modal_cargo_test.py
@@ -28,7 +28,7 @@ cuda_image = (
@app.function(
    image=cuda_image,
    gpu=gpu_type,
-    timeout=7200,  # 2 hours
+    timeout=1800,  # 30 minutes
 )
 def run_cargo_test():
    """Run cargo test for luminal_cuda_lite on a Modal GPU."""
@@ -47,7 +47,6 @@ def run_cargo_test():
        [
            "cargo",
            "test",
-            "--release",
            "-p",
            "luminal_cuda_lite",
            "--verbose",
--- a/ci/modal_example.py
+++ b/ci/modal_example.py
@@ -1,8 +1,6 @@
-import os
-import subprocess
-import sys
-
 import modal
+import subprocess
+import os

 example = os.environ.get("EXAMPLE", "llama")
 gpu_type = os.environ.get("GPU_TYPE", "A100-80GB")
@@ -20,37 +18,6 @@ hf_cache = modal.Volume.from_name(

 WORKDIR = "/workspace/luminal"

-EXAMPLE_CARGO_ARGS = {
-    "qwen": ["--features", "cuda"],
-}
-
-
-def run_and_capture(command: list[str], *, cwd: str, env: dict[str, str]) -> str:
-    process = subprocess.Popen(
-        command,
-        cwd=cwd,
-        env=env,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-    )
-    assert process.stdout is not None
-
-    chunks = []
-    while True:
-        chunk = process.stdout.read1(4096)
-        if not chunk:
-            break
-        sys.stdout.buffer.write(chunk)
-        sys.stdout.buffer.flush()
-        chunks.append(chunk)
-
-    return_code = process.wait()
-    output = b"".join(chunks).decode("utf-8", errors="replace")
-    if return_code:
-        raise subprocess.CalledProcessError(return_code, command, output=output)
-    return output
-
-
 cuda_image = (
    modal.Image.from_registry(
        "nvcr.io/nvidia/pytorch:25.03-py3"
@@ -72,7 +39,7 @@ cuda_image = (
@app.function(
    image=cuda_image,
    gpu=gpu_type,
-    timeout=7200,  # 2 hours
+    timeout=3600,  # 60 minutes
    volumes={
        HF_CACHE_PATH: hf_cache,
    },
@@ -80,20 +47,17 @@ cuda_image = (
 def run_example(example: str):
    """Build and run a luminal example on a Modal GPU."""
    subprocess.run(["nvidia-smi"], check=True)
-    sys.path.insert(0, f"{WORKDIR}/ci")
-    from example_output import validate_output

-    run_env = {
-        **os.environ,
-        "CUDARC_CUDA_VERSION": CUDARC_CUDA_VERSION,
-        "HF_HOME": HF_CACHE_PATH,
-    }
-    output = run_and_capture(
-        ["cargo", "run", "--release", *EXAMPLE_CARGO_ARGS.get(example, [])],
+    subprocess.run(
+        ["cargo", "run", "--release"],
        cwd=f"{WORKDIR}/examples/{example}",
-        env=run_env,
+        env={
+            **os.environ,
+            "CUDARC_CUDA_VERSION": CUDARC_CUDA_VERSION,
+            "HF_HOME": HF_CACHE_PATH,
+        },
+        check=True,
    )
-    validate_output(example, output)

    hf_cache.commit()

--- a/crates/luminal_cuda_lite/Cargo.toml
+++ b/crates/luminal_cuda_lite/Cargo.toml
@@ -10,8 +10,7 @@ license = "MIT OR Apache-2.0"
 [dependencies]
 luminal = { path = "../.." }
 luminal_tracing = { path = "../luminal_tracing" }
-cudarc = {version="0.19.4", features=["cuda-version-from-build-system", "fallback-latest"]}
-anyhow = "1.0"
+cudarc = {version="0.18.2", features=["cuda-version-from-build-system", "fallback-latest"]}
 as-any = "0.3.2"
 itertools = "0.12.1"
 fixedbitset = "0.5.7"
@@ -24,12 +23,10 @@ memmap2 = "0.9.9"
 uuid = {version="1.19.0", features=["v4"]}
 lru = "0.16.2"
 libc = "0.2"
-libloading = "0.8"
 colorize = "*"

 [dev-dependencies]
 candle-core = { version = "0.9.2", features = ["cuda"] }
-luminal_nn = { path = "../luminal_nn" }
 proptest = "1.9.0"
 rand = "0.9.2"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
--- a/crates/luminal_cuda_lite/examples/egglog_saturation.rs
+++ b/crates/luminal_cuda_lite/examples/egglog_saturation.rs
@@ -1,611 +0,0 @@
-use std::{collections::BTreeMap, sync::Arc, time::Instant};
-
-use itertools::Itertools;
-use luminal::prelude::egglog::{ast::Span, prelude::RustSpan};
-use luminal::{
-    dtype::DType,
-    egglog_utils::{
-        base::{base_cleanup_egglog, base_expression_egglog},
-        hlir_to_egglog,
-    },
-    hlir::HLIROps,
-    op::{EgglogOp, IntoEgglogOp, Runtime},
-    prelude::*,
-    shape::Expression,
-};
-use luminal_cuda_lite::runtime::CudaRuntime;
-
-const DEFAULT_PASSES: usize = 256;
-const EGGLOG_RULESETS: &[&str] = &[
-    "matmul_flatten",
-    "kernel_lower",
-    "direct_kernel",
-    "kernel_specialize",
-    "buffer_reuse",
-    "matmul_backend",
-    "glumoe",
-    "fusion_pair",
-    "fusion_grow",
-    "fusion_merge",
-];
-const MOE_SEQ: usize = 2;
-const MOE_HIDDEN: usize = 16;
-const MOE_NUM_EXPERTS: usize = 8;
-const MOE_TOP_K: usize = 2;
-const MOE_INTERMEDIATE: usize = 6;
-const GEMMA_RMS_NORM_EPS: f32 = 1e-6;
-
-#[derive(Debug, Clone, Copy)]
-enum Backend {
-    Native,
-    Cuda,
-}
-
-#[derive(Debug, Clone, Copy)]
-enum Mode {
-    Current,
-    Steps,
-    FullDefault,
-    FullCycle,
-}
-
-#[derive(Debug, Clone, Copy)]
-enum Case {
-    Mul,
-    UnaryChain(usize),
-    Gelu,
-    Softmax,
-    LayerNorm,
-    Matmul,
-    Attention,
-    QwenMoe,
-    GemmaMoe,
-}
-
-#[derive(Debug)]
-struct Args {
-    backend: Backend,
-    mode: Mode,
-    case: Case,
-    passes: usize,
-    cleanup: bool,
-    skip_roll: bool,
-}
-
-fn parse_args() -> Args {
-    let mut args = Args {
-        backend: Backend::Cuda,
-        mode: Mode::Current,
-        case: Case::Gelu,
-        passes: DEFAULT_PASSES,
-        cleanup: true,
-        skip_roll: false,
-    };
-
-    let mut iter = std::env::args().skip(1);
-    while let Some(arg) = iter.next() {
-        match arg.as_str() {
-            "--backend" => {
-                args.backend = match iter.next().as_deref() {
-                    Some("native") => Backend::Native,
-                    Some("cuda") => Backend::Cuda,
-                    other => panic!("invalid --backend {other:?}; use native|cuda"),
-                };
-            }
-            "--mode" => {
-                args.mode = match iter.next().as_deref() {
-                    Some("current") => Mode::Current,
-                    Some("steps") => Mode::Steps,
-                    Some("full-default") => Mode::FullDefault,
-                    Some("full-cycle") => Mode::FullCycle,
-                    other => panic!(
-                        "invalid --mode {other:?}; use current|steps|full-default|full-cycle"
-                    ),
-                };
-            }
-            "--case" => {
-                args.case = parse_case(&iter.next().expect("missing --case value"));
-            }
-            "--passes" => {
-                args.passes = iter
-                    .next()
-                    .expect("missing --passes value")
-                    .parse()
-                    .expect("invalid --passes value");
-            }
-            "--no-cleanup" => args.cleanup = false,
-            "--skip-roll" => args.skip_roll = true,
-            "--help" | "-h" => {
-                println!(
-                    "Usage: egglog_saturation [OPTIONS]\n\
-                     \n\
-                     Options:\n\
-                       --backend native|cuda          default: cuda\n\
-                       --mode current|steps|full-default|full-cycle\n\
-                       --case mul|unary-chain:N|gelu|softmax|layer-norm|matmul|attention|qwen-moe|gemma-moe\n\
-                       --passes N                    default: 256\n\
-                       --no-cleanup                  omit backend/HLIR cleanup rules\n\
-                       --skip-roll                   skip auto loop rolling prepass"
-                );
-                std::process::exit(0);
-            }
-            other => panic!("unknown argument {other}; use --help"),
-        }
-    }
-
-    args
-}
-
-fn parse_case(s: &str) -> Case {
-    if let Some(n) = s.strip_prefix("unary-chain:") {
-        return Case::UnaryChain(n.parse().expect("invalid unary-chain length"));
-    }
-    match s {
-        "mul" => Case::Mul,
-        "gelu" => Case::Gelu,
-        "softmax" => Case::Softmax,
-        "layer-norm" | "layer_norm" => Case::LayerNorm,
-        "matmul" => Case::Matmul,
-        "attention" => Case::Attention,
-        "qwen-moe" | "qwen_moe" => Case::QwenMoe,
-        "gemma-moe" | "gemma_moe" => Case::GemmaMoe,
-        other => panic!("unknown case {other}"),
-    }
-}
-
-fn build_case(case: Case) -> Graph {
-    let mut cx = Graph::new();
-    let out = match case {
-        Case::Mul => {
-            let x = cx.tensor((64, 64));
-            x * x
-        }
-        Case::UnaryChain(n) => {
-            let mut x = cx.tensor((64, 64));
-            for i in 0..n {
-                x = match i % 6 {
-                    0 => x.sin(),
-                    1 => x.sqrt(),
-                    2 => x.reciprocal(),
-                    3 => x.exp2(),
-                    4 => x.log2(),
-                    _ => x * 1.125,
-                };
-            }
-            x
-        }
-        Case::Gelu => cx.tensor((64, 64)).gelu(),
-        Case::Softmax => cx.tensor((128, 128)).softmax(1),
-        Case::LayerNorm => cx.tensor((128, 128)).layer_norm(1, 1e-5),
-        Case::Matmul => {
-            let a = cx.tensor((32, 64));
-            let b = cx.tensor((64, 32));
-            a.matmul(b)
-        }
-        Case::Attention => {
-            let q = cx.tensor((64, 32));
-            let k = cx.tensor((64, 32));
-            let v = cx.tensor((64, 32));
-            let scores = q.matmul(k.permute((1, 0))) * (1.0 / 32.0_f32.sqrt());
-            scores.softmax(1).matmul(v)
-        }
-        Case::QwenMoe => build_qwen_moe(&mut cx),
-        Case::GemmaMoe => build_gemma_moe(&mut cx),
-    };
-    let _ = out.output();
-    cx
-}
-
-fn build_qwen_moe(cx: &mut Graph) -> GraphTensor {
-    cx.set_dim('s', MOE_SEQ);
-    let x = cx.tensor(('s', MOE_HIDDEN));
-    let router = cx.tensor((MOE_NUM_EXPERTS, MOE_HIDDEN));
-    let gate_up_weights = cx
-        .tensor((MOE_NUM_EXPERTS, MOE_INTERMEDIATE * 2, MOE_HIDDEN))
-        .as_dtype(DType::Bf16);
-    let down_weights = cx
-        .tensor((MOE_NUM_EXPERTS, MOE_HIDDEN, MOE_INTERMEDIATE))
-        .as_dtype(DType::Bf16);
-
-    let n = x.dims().len();
-    let e_dim = *router.dims().first().unwrap();
-    let k_expr = Expression::from(MOE_TOP_K);
-
-    let routing_weights = x.matmul(router.t()).softmax(n - 1);
-    let top_k_indices = routing_weights.topk_indexes(MOE_TOP_K, n - 1);
-    let row_offsets = x
-        .graph()
-        .iota(Expression::from('z') / k_expr * e_dim, top_k_indices.dims());
-    let routing_flat_idx = row_offsets + top_k_indices;
-    let top_k_values = routing_weights.gather(routing_flat_idx);
-
-    let gate_up_gathered = gather_experts(x, top_k_indices, gate_up_weights).cast(DType::F32);
-    let x_exp = x.expand_dim(n - 1, MOE_TOP_K).unsqueeze(n);
-    let gate_up_out = x_exp.matmul(gate_up_gathered.transpose(2, 3)).squeeze(n);
-    let gate = gate_up_out.slice((.., .., ..MOE_INTERMEDIATE));
-    let up = gate_up_out.slice((.., .., MOE_INTERMEDIATE..));
-    let hidden = gate.silu() * up;
-
-    let down_gathered = gather_experts(x, top_k_indices, down_weights).cast(DType::F32);
-    let down_out = hidden
-        .unsqueeze(2)
-        .matmul(down_gathered.transpose(2, 3))
-        .squeeze(2);
-    let mut weights_exp = top_k_values.unsqueeze(top_k_values.dims().len());
-    weights_exp.shape.expand(down_out.dims());
-    (down_out * weights_exp).sum(n - 1)
-}
-
-fn build_gemma_moe(cx: &mut Graph) -> GraphTensor {
-    cx.set_dim('s', MOE_SEQ);
-    let router_input = cx.tensor(('s', MOE_HIDDEN));
-    let expert_input = cx.tensor(('s', MOE_HIDDEN));
-    let router_scale = cx.tensor(MOE_HIDDEN);
-    let router_proj = cx.tensor((MOE_NUM_EXPERTS, MOE_HIDDEN));
-    let per_expert_scale = cx.tensor(MOE_NUM_EXPERTS);
-    let gate_up_weights = cx
-        .tensor((MOE_NUM_EXPERTS, MOE_INTERMEDIATE * 2, MOE_HIDDEN))
-        .as_dtype(DType::Bf16);
-    let down_weights = cx
-        .tensor((MOE_NUM_EXPERTS, MOE_HIDDEN, MOE_INTERMEDIATE))
-        .as_dtype(DType::Bf16);
-
-    let n = router_input.dims().len();
-    let e_dim = *router_proj.dims().first().unwrap();
-    let k_expr = Expression::from(MOE_TOP_K);
-
-    let router_hidden = router_input.std_norm(n - 1, GEMMA_RMS_NORM_EPS)
-        * router_scale.expand_lhs(&router_input.dims()[..n - 1])
-        * (MOE_HIDDEN as f32).sqrt().recip();
-    let routing_weights = router_hidden.matmul(router_proj.t()).softmax(n - 1);
-    let top_k_indices = routing_weights.topk_indexes(MOE_TOP_K, n - 1);
-    let row_offsets = router_input
-        .graph()
-        .iota(Expression::from('z') / k_expr * e_dim, top_k_indices.dims());
-    let routing_flat_idx = row_offsets + top_k_indices;
-    let top_k_values = routing_weights.gather(routing_flat_idx);
-    let top_k_norm = top_k_values.sum(n - 1).expand_dim(n - 1, MOE_TOP_K);
-    let top_k_weights = (top_k_values / top_k_norm) * per_expert_scale.gather(top_k_indices);
-
-    let gate_up_gathered =
-        gather_experts(expert_input, top_k_indices, gate_up_weights).cast(DType::F32);
-    let x_exp = expert_input.expand_dim(n - 1, MOE_TOP_K).unsqueeze(n);
-    let gate_up_out = x_exp.matmul(gate_up_gathered.transpose(2, 3)).squeeze(n);
-    let gate = gate_up_out.slice((.., .., ..MOE_INTERMEDIATE));
-    let up = gate_up_out.slice((.., .., MOE_INTERMEDIATE..));
-    let hidden = gemma_gelu(gate) * up;
-
-    let down_gathered = gather_experts(expert_input, top_k_indices, down_weights).cast(DType::F32);
-    let down_out = hidden
-        .unsqueeze(2)
-        .matmul(down_gathered.transpose(2, 3))
-        .squeeze(2);
-    let mut weights_exp = top_k_weights.unsqueeze(top_k_weights.dims().len());
-    weights_exp.shape.expand(down_out.dims());
-    (down_out * weights_exp).sum(n - 1)
-}
-
-fn gather_experts(
-    graph_source: GraphTensor,
-    top_k_indices: GraphTensor,
-    weights: GraphTensor,
-) -> GraphTensor {
-    let (_, d1, d2) = weights.dims3();
-    let io = d1 * d2;
-    let base = top_k_indices * io;
-    let within = graph_source.graph().iota(Expression::from('z'), (d1, d2));
-    let n_base = base.dims().len();
-    let exp_base = base.expand_dim(n_base, d1).expand_dim(n_base + 1, d2);
-    let mut exp_within = within;
-    for (axis, dim) in base.dims().iter().enumerate() {
-        exp_within = exp_within.expand_dim(axis, *dim);
-    }
-    weights.gather(exp_base + exp_within)
-}
-
-#[allow(clippy::excessive_precision)]
-fn gemma_gelu(x: GraphTensor) -> GraphTensor {
-    let scaled = 1.5957691216 * x * (1. + 0.044715 * x * x);
-    x * scaled.sigmoid()
-}
-
-fn op_defs_string(ops: &[Arc<Box<dyn EgglogOp>>]) -> String {
-    let mut ir_variants = Vec::new();
-    let mut opkind_variants = Vec::new();
-    for op in ops {
-        let sort = op.sort();
-        let variant = format!(
-            "({} {})",
-            sort.name,
-            sort.fields.iter().map(|field| &field.sort).join(" ")
-        );
-        match sort.class.as_str() {
-            "IR" => ir_variants.push(variant),
-            "OpKind" => opkind_variants.push(variant),
-            other => panic!("unknown sort class {other} for {}", sort.name),
-        }
-    }
-    let extra_ir = ops.iter().flat_map(|op| op.ir_defs()).unique().join("\n");
-    format!(
-        "
-(datatype*
-    (IR
-        (OutputJoin IR IR)
-        (Op OpKind IList)
-        {extra_ir}
-        {}
-    )
-    (OpKind
-        {}
-    )
-    (IList
-        (ICons IR IList)
-        (INil)
-    )
-)
-(function dtype (IR) DType :merge new)
-",
-        ir_variants.join("\n"),
-        opkind_variants.join("\n")
-    )
-}
-
-fn op_cleanups_string(ops: &[Arc<Box<dyn EgglogOp>>]) -> String {
-    ops.iter()
-        .filter(|op| op.cleanup())
-        .map(|op| {
-            let sort = op.sort();
-            let fields = (0..sort.fields.len())
-                .map(|i| (b'a' + i as u8) as char)
-                .join(" ");
-            if sort.class == "OpKind" {
-                format!(
-                    "(rule
-                       ((= ?m (Op ({} {fields}) ?__cleanup_inputs)))
-                       ((delete (Op ({} {fields}) ?__cleanup_inputs)))
-                       :ruleset cleanup)",
-                    sort.name, sort.name
-                )
-            } else {
-                format!(
-                    "(rule
-                       ((= ?m ({} {fields})))
-                       ((delete ({} {fields})))
-                       :ruleset cleanup)",
-                    sort.name, sort.name
-                )
-            }
-        })
-        .join("\n")
-}
-
-fn setup_program(program: &str, ops: &[Arc<Box<dyn EgglogOp>>], cleanup: bool) -> String {
-    let rewrites = ops
-        .iter()
-        .flat_map(|op| op.rewrites())
-        .map(|rule| rule.to_egglog_string())
-        .join("\n");
-    [
-        EGGLOG_RULESETS
-            .iter()
-            .map(|ruleset| format!("(ruleset {ruleset})"))
-            .join("\n"),
-        base_expression_egglog(),
-        op_defs_string(ops),
-        if cleanup {
-            op_cleanups_string(ops)
-        } else {
-            String::new()
-        },
-        base_cleanup_egglog(),
-        rewrites,
-        program.to_string(),
-    ]
-    .join("\n")
-}
-
-fn producer_schedule() -> String {
-    "(seq
-        (saturate expr)
-        (saturate dtype_prop)
-        (run matmul_flatten)
-        (run kernel_lower)
-        (run direct_kernel)
-        (run kernel_specialize)
-        (run buffer_reuse)
-        (run matmul_backend)
-        (run glumoe)
-        (run fusion_pair)
-    )"
-    .to_string()
-}
-
-fn fusion_schedule() -> String {
-    "(seq
-        (saturate expr)
-        (saturate dtype_prop)
-        (run fusion_grow)
-        (run fusion_merge)
-    )"
-    .to_string()
-}
-
-fn split_cycle() -> Vec<(&'static str, String)> {
-    vec![
-        ("producers", format!("(saturate {})", producer_schedule())),
-        ("fusion", format!("(saturate {})", fusion_schedule())),
-    ]
-}
-
-fn split_cycle_schedule() -> String {
-    format!(
-        "(seq
-            (saturate {})
-            (saturate {})
-        )",
-        producer_schedule(),
-        fusion_schedule()
-    )
-}
-
-fn phase(egraph: &mut egglog::EGraph, name: &str, schedule: &str) -> bool {
-    let before = egraph.num_tuples();
-    let start = Instant::now();
-    let command = format!("(run-schedule {schedule})");
-    let outputs = egraph
-        .parse_and_run_program(None, &command)
-        .unwrap_or_else(|err| panic!("failed phase {name} schedule {schedule}: {err}"));
-    let elapsed = start.elapsed();
-    let after = egraph.num_tuples();
-    let report = outputs
-        .into_iter()
-        .find_map(|output| match output {
-            egglog::CommandOutput::RunSchedule(report) => Some(report),
-            _ => None,
-        })
-        .expect("run-schedule did not return a report");
-    let mut rules = report
-        .search_and_apply_time_per_rule
-        .iter()
-        .map(|(rule, time)| {
-            (
-                rule.to_string(),
-                *time,
-                report
-                    .num_matches_per_rule
-                    .get(rule)
-                    .copied()
-                    .unwrap_or_default(),
-            )
-        })
-        .collect_vec();
-    rules.sort_by_key(|(_, time, matches)| (std::cmp::Reverse(*time), std::cmp::Reverse(*matches)));
-    let matches = report.num_matches_per_rule.values().sum::<usize>();
-    println!(
-        "phase {name:<18} {elapsed_ms:>8.2} ms | tuples {before} -> {after} ({delta:+}) | updated={updated} | iters={iters} | matches={matches}",
-        elapsed_ms = elapsed.as_secs_f64() * 1000.0,
-        delta = after as isize - before as isize,
-        updated = report.updated,
-        iters = report.iterations.len(),
-    );
-    for (rule, time, matches) in rules
-        .into_iter()
-        .filter(|(_, time, matches)| !time.is_zero() || *matches > 0)
-        .take(8)
-    {
-        println!(
-            "  rule {rule:<82} {ms:>8.2} ms | matches {matches}",
-            ms = time.as_secs_f64() * 1000.0,
-        );
-    }
-    report.updated
-}
-
-fn serialize_summary(egraph: &mut egglog::EGraph, root: &str) {
-    let (sort, value) = egraph.eval_expr(&egglog::var!(root.to_string())).unwrap();
-    let output = egraph.serialize(egglog::SerializeConfig {
-        root_eclasses: vec![(sort, value)],
-        max_functions: None,
-        include_temporary_functions: false,
-        max_calls_per_function: None,
-    });
-    let mut classes = std::collections::BTreeSet::new();
-    let mut top_ops = BTreeMap::<String, usize>::new();
-    let mut nodes = 0usize;
-    for node in output.egraph.nodes.values().filter(|node| !node.subsumed) {
-        nodes += 1;
-        classes.insert(node.eclass.clone());
-        *top_ops.entry(node.op.clone()).or_default() += 1;
-    }
-    let top_ops = top_ops
-        .into_iter()
-        .sorted_by_key(|(_, count)| std::cmp::Reverse(*count))
-        .take(12)
-        .map(|(op, count)| format!("{op}={count}"))
-        .join(", ");
-    println!(
-        "serialize nodes={nodes} classes={} roots={} top_ops={top_ops}",
-        classes.len(),
-        output.egraph.root_eclasses.len()
-    );
-}
-
-fn run(args: Args) {
-    let mut graph = build_case(args.case);
-    let rolled = if args.skip_roll {
-        0
-    } else {
-        graph.auto_roll_loops_prepass()
-    };
-    let (program, root) = hlir_to_egglog(&graph);
-
-    let mut ops = match args.backend {
-        Backend::Native => <NativeRuntime as Runtime>::Ops::into_vec(),
-        Backend::Cuda => <CudaRuntime as Runtime>::Ops::into_vec(),
-    };
-    ops.extend(<HLIROps as IntoEgglogOp>::into_vec());
-    let cleanup = args.cleanup && matches!(args.backend, Backend::Cuda);
-    let setup = setup_program(&program, &ops, cleanup);
-
-    println!(
-        "case={:?} backend={:?} mode={:?} passes={} cleanup={} rolled={} hlir_nodes={} setup_lines={} setup_bytes={} root={root}",
-        args.case,
-        args.backend,
-        args.mode,
-        args.passes,
-        cleanup,
-        rolled,
-        graph.graph.node_count(),
-        setup.lines().count(),
-        setup.len(),
-    );
-
-    let mut egraph = egglog::EGraph::default();
-    let before = egraph.num_tuples();
-    let start = Instant::now();
-    let commands = egraph.parser.get_program_from_string(None, &setup).unwrap();
-    egraph.run_program(commands).unwrap();
-    println!(
-        "setup {:>8.2} ms | tuples {before} -> {} ({:+})",
-        start.elapsed().as_secs_f64() * 1000.0,
-        egraph.num_tuples(),
-        egraph.num_tuples() as isize - before as isize,
-    );
-
-    match args.mode {
-        Mode::Current | Mode::Steps => {
-            for pass in 1..=args.passes {
-                let mut updated = false;
-                for (name, schedule) in split_cycle() {
-                    updated |= phase(&mut egraph, &format!("{pass:03} {name}"), &schedule);
-                }
-                if matches!(args.mode, Mode::Current) && !updated {
-                    break;
-                }
-            }
-        }
-        Mode::FullDefault => {
-            phase(&mut egraph, "expr", "(saturate expr)");
-            phase(&mut egraph, "dtype", "(saturate dtype_prop)");
-            phase(&mut egraph, "default-full", "(saturate (run))");
-        }
-        Mode::FullCycle => {
-            phase(
-                &mut egraph,
-                "cycle-full",
-                &format!("(saturate {})", split_cycle_schedule()),
-            );
-        }
-    }
-
-    phase(&mut egraph, "final expr", "(saturate expr)");
-    if cleanup {
-        phase(&mut egraph, "cleanup", "(saturate cleanup)");
-    }
-    phase(&mut egraph, "base cleanup", "(saturate base_cleanup)");
-    serialize_summary(&mut egraph, &root);
-}
-
-fn main() {
-    run(parse_args());
-}
--- a/crates/luminal_cuda_lite/src/host/cublas/mod.rs
+++ b/crates/luminal_cuda_lite/src/host/cublas/mod.rs
@@ -19,9 +19,9 @@ use crate::{
            CudaBlas,
            sys::{cublasOperation_t, cublasSetStream_v2, cublasSgemm_v2, cublasStatus_t},
        },
-        driver::CudaStream,
+        driver::{CudaSlice, CudaStream, DevicePtr},
    },
-    host::{DeviceBuffer, HostOp},
+    host::HostOp,
 };

 /// Global shared cuBLAS handle to avoid per-operation workspace allocation
@@ -156,7 +156,7 @@ impl HostOp for CuBlasSgemmV2 {
        stream: &Arc<CudaStream>,
        self_node: NodeIndex,
        inputs: &[NodeIndex],
-        buffers: &FxHashMap<NodeIndex, DeviceBuffer>,
+        buffers: &FxHashMap<NodeIndex, &CudaSlice<u8>>,
        dyn_map: &FxHashMap<char, usize>,
    ) -> anyhow::Result<()> {
        // GEMM parameters
@@ -178,9 +178,9 @@ impl HostOp for CuBlasSgemmV2 {
        let b_buf = buffers[&inputs[1]];

        // Get device pointers
-        let a_ptr = a_buf.ptr();
-        let b_ptr = b_buf.ptr();
-        let c_ptr = c_buf.ptr();
+        let (a_ptr, _a_guard) = a_buf.device_ptr(stream);
+        let (b_ptr, _b_guard) = b_buf.device_ptr(stream);
+        let (c_ptr, _c_guard) = c_buf.device_ptr(stream);

        // Debug: Check buffer sizes
        trace!(
--- a/crates/luminal_cuda_lite/src/host/cublas/sgemm_v2_CmCm_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/cublas/sgemm_v2_CmCm_rewrite.egg
@@ -68,6 +68,5 @@
        (union ?sum ?sgemm)
        (set (dtype ?sgemm) (F32))
    )
-    :ruleset matmul_backend
    :name "cublas sgemm column-major × column-major"
 )
--- a/crates/luminal_cuda_lite/src/host/cublas/sgemm_v2_CmRm_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/cublas/sgemm_v2_CmRm_rewrite.egg
@@ -68,6 +68,5 @@
        (union ?sum ?sgemm)
        (set (dtype ?sgemm) (F32))
    )
-    :ruleset matmul_backend
    :name "cublas sgemm column-major × row-major"
 )
--- a/crates/luminal_cuda_lite/src/host/cublas/sgemm_v2_RmCm_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/cublas/sgemm_v2_RmCm_rewrite.egg
@@ -68,6 +68,5 @@
        (union ?sum ?sgemm)
        (set (dtype ?sgemm) (F32))
    )
-    :ruleset matmul_backend
    :name "cublas sgemm row-major × column-major"
-)
+)
--- a/crates/luminal_cuda_lite/src/host/cublas/sgemm_v2_RmRm_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/cublas/sgemm_v2_RmRm_rewrite.egg
@@ -68,6 +68,5 @@
        (union ?sum ?sgemm)
        (set (dtype ?sgemm) (F32))
    )
-    :ruleset matmul_backend
    :name "cublas sgemm row-major"
-)
+)
--- a/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_CmCm_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_CmCm_rewrite.egg
@@ -42,7 +42,6 @@

        (= ?dt (dtype ?a))
        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
    )
    (
        ; For column-major A × column-major B with cuBLAS:
@@ -53,22 +52,18 @@
            ?k             ; k unchanged
            "T"            ; transa = Transpose (B is column-major [k,n], need B^T[n,k])
            "T"            ; transb = Transpose (A is column-major [m,k], need A^T[k,m])
-            "COL" "COL" "COL" "COL" ; A/B/C/D matrix orders
            ?b_n_stride    ; lda = B's column stride (resolves to k after z→1)
            ?a_k_stride    ; ldb = A's column stride (resolves to m after z→1)
            ?n             ; ldc = n (row-major C[m,n] viewed as col-major [n,m])
-            ?n             ; ldd = ldc for current row-major output rewrites
            (MNum 1)       ; batch_count = 1
            (MNum 0)       ; stride_a = 0
            (MNum 0)       ; stride_b = 0
            (MNum 0)       ; stride_c = 0
-            (MNum 0)       ; stride_d = 0
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")   ; type tuple, alpha, beta
+            ?dt)           ; dtype
            (ICons ?b (ICons ?a (INil)))))
        (union ?sum ?sgemm)
        (set (dtype ?sgemm) ?dt)
    )
-    :ruleset matmul_backend
    :name "cublaslt column-major × column-major"
 )

@@ -116,28 +111,23 @@

        (= ?dt (dtype ?a))
        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
    )
    (
        ; cuBLAS: cublas(OP_T, OP_T, n, m, k, B, lda=b_n_stride, A, ldb=a_k_stride, C, ldc=n)
        (let ?sgemm (Op (cublaslt
            ?n ?m ?k
            "T" "T"
-            "COL" "COL" "COL" "COL"
            ?b_n_stride       ; lda (cuBLAS A = our B, column stride)
            ?a_k_stride       ; ldb (cuBLAS B = our A, column stride)
            ?n                ; ldc
-            ?n                ; ldd
            ?batch
            ?b_batch_stride   ; stride_a (cuBLAS A = our B)
            ?a_batch_stride   ; stride_b (cuBLAS B = our A)
            (MMul ?m ?n)      ; stride_c
-            (MMul ?m ?n)      ; stride_d
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")
+            ?dt)
            (ICons ?b (ICons ?a (INil)))))
        (union ?sum ?sgemm)
        (set (dtype ?sgemm) ?dt)
    )
-    :ruleset matmul_backend
    :name "cublaslt batched column-major × column-major"
 )
--- a/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_CmRm_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_CmRm_rewrite.egg
@@ -42,7 +42,6 @@

        (= ?dt (dtype ?a))
        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
    )
    (
        ; For column-major A × row-major B with cuBLAS:
@@ -53,22 +52,18 @@
            ?k             ; k unchanged
            "N"            ; transa = No transpose (B is row-major, viewed as col-major [n,k])
            "T"            ; transb = Transpose (A is column-major [m,k], need A^T[k,m])
-            "COL" "COL" "COL" "COL" ; A/B/C/D matrix orders
            ?b_k_stride    ; lda = B's row stride (resolves to n after z→1)
            ?a_k_stride    ; ldb = A's column stride (resolves to m after z→1)
            ?n             ; ldc = n (row-major C[m,n] viewed as col-major [n,m])
-            ?n             ; ldd = ldc for current row-major output rewrites
            (MNum 1)       ; batch_count = 1
            (MNum 0)       ; stride_a = 0
            (MNum 0)       ; stride_b = 0
            (MNum 0)       ; stride_c = 0
-            (MNum 0)       ; stride_d = 0
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")   ; type tuple, alpha, beta
+            ?dt)           ; dtype
            (ICons ?b (ICons ?a (INil)))))
        (union ?sum ?sgemm)
        (set (dtype ?sgemm) ?dt)
    )
-    :ruleset matmul_backend
    :name "cublaslt column-major × row-major"
 )

@@ -116,28 +111,23 @@

        (= ?dt (dtype ?a))
        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
    )
    (
        ; cuBLAS: cublas(OP_N, OP_T, n, m, k, B, lda=b_k_stride, A, ldb=a_k_stride, C, ldc=n)
        (let ?sgemm (Op (cublaslt
            ?n ?m ?k
            "N" "T"
-            "COL" "COL" "COL" "COL"
            ?b_k_stride       ; lda (cuBLAS A = our B, row stride)
            ?a_k_stride       ; ldb (cuBLAS B = our A, column stride)
            ?n                ; ldc
-            ?n                ; ldd
            ?batch
            ?b_batch_stride   ; stride_a (cuBLAS A = our B)
            ?a_batch_stride   ; stride_b (cuBLAS B = our A)
            (MMul ?m ?n)      ; stride_c
-            (MMul ?m ?n)      ; stride_d
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")
+            ?dt)
            (ICons ?b (ICons ?a (INil)))))
        (union ?sum ?sgemm)
        (set (dtype ?sgemm) ?dt)
    )
-    :ruleset matmul_backend
    :name "cublaslt batched column-major × row-major"
 )
--- a/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_RmCm_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_RmCm_rewrite.egg
@@ -42,7 +42,6 @@

        (= ?dt (dtype ?a))
        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
    )
    (
        ; For row-major A × column-major B with cuBLAS:
@@ -53,22 +52,18 @@
            ?k             ; k unchanged
            "T"            ; transa = Transpose (B is column-major, need B^T)
            "N"            ; transb = No transpose
-            "COL" "COL" "COL" "COL" ; A/B/C/D matrix orders
            ?b_n_stride    ; lda = B's column stride (resolves to k after z→1)
            ?a_m_stride    ; ldb = A's row stride (resolves to k after z→1)
            ?n             ; ldc = n (row-major C[m,n] viewed as col-major [n,m])
-            ?n             ; ldd = ldc for current row-major output rewrites
            (MNum 1)       ; batch_count = 1
            (MNum 0)       ; stride_a = 0
            (MNum 0)       ; stride_b = 0
            (MNum 0)       ; stride_c = 0
-            (MNum 0)       ; stride_d = 0
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")   ; type tuple, alpha, beta
+            ?dt)           ; dtype
            (ICons ?b (ICons ?a (INil)))))
        (union ?sum ?sgemm)
        (set (dtype ?sgemm) ?dt)
    )
-    :ruleset matmul_backend
    :name "cublaslt row-major × column-major"
 )

@@ -116,28 +111,23 @@

        (= ?dt (dtype ?a))
        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
    )
    (
        ; cuBLAS: cublas(OP_T, OP_N, n, m, k, B, lda=b_n_stride, A, ldb=a_m_stride, C, ldc=n)
        (let ?sgemm (Op (cublaslt
            ?n ?m ?k
            "T" "N"
-            "COL" "COL" "COL" "COL"
            ?b_n_stride       ; lda (cuBLAS A = our B, column stride)
            ?a_m_stride       ; ldb (cuBLAS B = our A, row stride)
            ?n                ; ldc
-            ?n                ; ldd
            ?batch
            ?b_batch_stride   ; stride_a (cuBLAS A = our B)
            ?a_batch_stride   ; stride_b (cuBLAS B = our A)
            (MMul ?m ?n)      ; stride_c
-            (MMul ?m ?n)      ; stride_d
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")
+            ?dt)
            (ICons ?b (ICons ?a (INil)))))
        (union ?sum ?sgemm)
        (set (dtype ?sgemm) ?dt)
    )
-    :ruleset matmul_backend
    :name "cublaslt batched row-major × column-major"
 )
--- a/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_RmRm_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_RmRm_rewrite.egg
@@ -42,7 +42,6 @@

        (= ?dt (dtype ?a))
        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
    )
    (
        ; For row-major C = A × B with cuBLAS (column-major):
@@ -53,22 +52,18 @@
            ?k             ; k unchanged
            "N"            ; transa = No transpose
            "N"            ; transb = No transpose
-            "COL" "COL" "COL" "COL" ; A/B/C/D matrix orders
            ?b_k_stride    ; lda = B's row stride (resolves to n after z→1)
            ?a_m_stride    ; ldb = A's row stride (resolves to k after z→1)
            ?n             ; ldc = n (row-major C[m,n] viewed as col-major [n,m])
-            ?n             ; ldd = ldc for current row-major output rewrites
            (MNum 1)       ; batch_count = 1
            (MNum 0)       ; stride_a = 0
            (MNum 0)       ; stride_b = 0
            (MNum 0)       ; stride_c = 0
-            (MNum 0)       ; stride_d = 0
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")   ; type tuple, alpha, beta
+            ?dt)           ; dtype
            (ICons ?b (ICons ?a (INil)))))
        (union ?sum ?sgemm)
        (set (dtype ?sgemm) ?dt)
    )
-    :ruleset matmul_backend
    :name "cublaslt row-major x row-major"
 )

@@ -121,7 +116,6 @@

        (= ?dt (dtype ?a))
        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
    )
    (
        ; cuBLAS swap: C^T[n,m] = B^T[n,k] × A^T[k,m] per batch
@@ -129,21 +123,17 @@
        (let ?sgemm (Op (cublaslt
            ?n ?m ?k
            "N" "N"
-            "COL" "COL" "COL" "COL"
            ?b_k_stride       ; lda (cuBLAS A = our B, row stride)
            ?a_m_stride       ; ldb (cuBLAS B = our A, row stride)
            ?n                ; ldc (contiguous output per batch)
-            ?n                ; ldd
            ?batch            ; batch_count
            ?b_batch_stride   ; stride_a (cuBLAS A = our B)
            ?a_batch_stride   ; stride_b (cuBLAS B = our A)
            (MMul ?m ?n)      ; stride_c
-            (MMul ?m ?n)      ; stride_d
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")
+            ?dt)
            (ICons ?b (ICons ?a (INil)))))
        (union ?sum ?sgemm)
        (set (dtype ?sgemm) ?dt)
    )
-    :ruleset matmul_backend
    :name "cublaslt batched row-major × row-major"
 )
--- a/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_beta_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_beta_rewrite.egg
@@ -1,428 +0,0 @@
-; Fuse a row-major Add on top of an existing cuBLASLt matmul into
-; D = alpha * A * B + beta * C.
-;
-; The existing matmul rewrites view Luminal's row-major output [m,n] as a
-; column-major cuBLASLt matrix [n,m]. A row-major C input with logical strides
-; [row_stride, 1] therefore maps to ldc=row_stride. This lets a C slice from a
-; wider parent tensor use a larger ldc while D keeps the matmul output layout.
-; cuBLASLt requires out-of-place C and D to have the same matrix order, so these
-; beta rules only fuse C layouts that map to the current COL-ordered D layout.
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?matmul_c_order "COL"
-            ?lda ?ldb ?matmul_ldc ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b ?matmul_stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 ?epilogue)
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (!= ?epilogue "RELU")
-        (!= ?epilogue "RELU_BIAS")
-        (!= ?epilogue "GELU")
-        (!= ?epilogue "GELU_BIAS")
-
-        (= ?add (Op (Add
-            (ECons ?n (ECons ?m (ENil)))
-            ?matmul_add_strides
-            ?c_add_strides
-            ?add_out_strides)
-            (ICons ?matmul (ICons ?c (INil)))))
-
-        (= ?matmul_add_strides (ECons ?d_row_stride (ECons ?d_col_stride (ENil))))
-        (= ?c_add_strides (ECons ?c_row_stride (ECons ?c_col_stride (ENil))))
-        (= ?add_out_strides (ECons ?d_row_stride (ECons ?d_col_stride (ENil))))
-        (= ?c_col_stride (MIter))
-        (!= ?c_row_stride (MNum 0))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?c_dtype (dtype ?c))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order "COL" "COL"
-            ?lda ?ldb ?c_row_stride ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b (MNum 0) ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 1.0 ?epilogue)
-            (ICons ?a (ICons ?b (ICons ?c ?matmul_tail)))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt 2d matmul plus c beta"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?matmul_c_order "COL"
-            ?lda ?ldb ?matmul_ldc ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b ?matmul_stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 ?epilogue)
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (!= ?epilogue "RELU")
-        (!= ?epilogue "RELU_BIAS")
-        (!= ?epilogue "GELU")
-        (!= ?epilogue "GELU_BIAS")
-
-        (= ?add (Op (Add
-            (ECons ?n (ECons ?m (ENil)))
-            ?c_add_strides
-            ?matmul_add_strides
-            ?add_out_strides)
-            (ICons ?c (ICons ?matmul (INil)))))
-
-        (= ?matmul_add_strides (ECons ?d_row_stride (ECons ?d_col_stride (ENil))))
-        (= ?c_add_strides (ECons ?c_row_stride (ECons ?c_col_stride (ENil))))
-        (= ?add_out_strides (ECons ?d_row_stride (ECons ?d_col_stride (ENil))))
-        (= ?c_col_stride (MIter))
-        (!= ?c_row_stride (MNum 0))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?c_dtype (dtype ?c))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order "COL" "COL"
-            ?lda ?ldb ?c_row_stride ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b (MNum 0) ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 1.0 ?epilogue)
-            (ICons ?a (ICons ?b (ICons ?c ?matmul_tail)))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt 2d c plus matmul beta"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?matmul_c_order "COL"
-            ?lda ?ldb ?matmul_ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?matmul_stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 ?epilogue)
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (!= ?epilogue "RELU")
-        (!= ?epilogue "RELU_BIAS")
-        (!= ?epilogue "GELU")
-        (!= ?epilogue "GELU_BIAS")
-
-        (= ?add (Op (Add
-            (ECons ?batch (ECons ?n (ECons ?m (ENil))))
-            ?matmul_add_strides
-            ?c_add_strides
-            ?add_out_strides)
-            (ICons ?matmul (ICons ?c (INil)))))
-
-        (= ?matmul_add_strides (ECons ?d_batch_stride (ECons ?d_row_stride (ECons ?d_col_stride (ENil)))))
-        (= ?c_add_strides (ECons ?c_batch_stride (ECons ?c_row_stride (ECons ?c_col_stride (ENil)))))
-        (= ?add_out_strides (ECons ?d_batch_stride (ECons ?d_row_stride (ECons ?d_col_stride (ENil)))))
-        (= ?c_col_stride (MIter))
-        (!= ?c_row_stride (MNum 0))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?c_dtype (dtype ?c))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order "COL" "COL"
-            ?lda ?ldb ?c_row_stride ?ldd
-            ?batch
-            ?stride_a ?stride_b ?c_batch_stride ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 1.0 ?epilogue)
-            (ICons ?a (ICons ?b (ICons ?c ?matmul_tail)))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt batched matmul plus c beta"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?matmul_c_order "COL"
-            ?lda ?ldb ?matmul_ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?matmul_stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 ?epilogue)
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (!= ?epilogue "RELU")
-        (!= ?epilogue "RELU_BIAS")
-        (!= ?epilogue "GELU")
-        (!= ?epilogue "GELU_BIAS")
-
-        (= ?add (Op (Add
-            (ECons ?batch (ECons ?n (ECons ?m (ENil))))
-            ?c_add_strides
-            ?matmul_add_strides
-            ?add_out_strides)
-            (ICons ?c (ICons ?matmul (INil)))))
-
-        (= ?matmul_add_strides (ECons ?d_batch_stride (ECons ?d_row_stride (ECons ?d_col_stride (ENil)))))
-        (= ?c_add_strides (ECons ?c_batch_stride (ECons ?c_row_stride (ECons ?c_col_stride (ENil)))))
-        (= ?add_out_strides (ECons ?d_batch_stride (ECons ?d_row_stride (ECons ?d_col_stride (ENil)))))
-        (= ?c_col_stride (MIter))
-        (!= ?c_row_stride (MNum 0))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?c_dtype (dtype ?c))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order "COL" "COL"
-            ?lda ?ldb ?c_row_stride ?ldd
-            ?batch
-            ?stride_a ?stride_b ?c_batch_stride ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 1.0 ?epilogue)
-            (ICons ?a (ICons ?b (ICons ?c ?matmul_tail)))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt batched c plus matmul beta"
-)
-
-; ROW-ordered D beta fusions. These pair with cublaslt_row_order_rewrite.egg,
-; where the cuBLASLt problem dimensions match Luminal's logical output [m,n].
-; A row-major C input with logical strides [row_stride, 1] maps directly to a
-; ROW-ordered cuBLASLt C[m,n] descriptor with ldc=row_stride.
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?matmul_c_order "ROW"
-            ?lda ?ldb ?matmul_ldc ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b ?matmul_stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 ?epilogue)
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (!= ?epilogue "RELU")
-        (!= ?epilogue "RELU_BIAS")
-        (!= ?epilogue "GELU")
-        (!= ?epilogue "GELU_BIAS")
-
-        (= ?add (Op (Add
-            (ECons ?m (ECons ?n (ENil)))
-            ?matmul_add_strides
-            ?c_add_strides
-            ?add_out_strides)
-            (ICons ?matmul (ICons ?c (INil)))))
-
-        (= ?matmul_add_strides (ECons ?d_row_stride (ECons ?d_col_stride (ENil))))
-        (= ?c_add_strides (ECons ?c_row_stride (ECons ?c_col_stride (ENil))))
-        (= ?add_out_strides (ECons ?d_row_stride (ECons ?d_col_stride (ENil))))
-        (= ?c_col_stride (MIter))
-        (!= ?c_row_stride (MNum 0))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?c_dtype (dtype ?c))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order "ROW" "ROW"
-            ?lda ?ldb ?c_row_stride ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b (MNum 0) ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 1.0 ?epilogue)
-            (ICons ?a (ICons ?b (ICons ?c ?matmul_tail)))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order 2d matmul plus c beta"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?matmul_c_order "ROW"
-            ?lda ?ldb ?matmul_ldc ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b ?matmul_stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 ?epilogue)
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (!= ?epilogue "RELU")
-        (!= ?epilogue "RELU_BIAS")
-        (!= ?epilogue "GELU")
-        (!= ?epilogue "GELU_BIAS")
-
-        (= ?add (Op (Add
-            (ECons ?m (ECons ?n (ENil)))
-            ?c_add_strides
-            ?matmul_add_strides
-            ?add_out_strides)
-            (ICons ?c (ICons ?matmul (INil)))))
-
-        (= ?matmul_add_strides (ECons ?d_row_stride (ECons ?d_col_stride (ENil))))
-        (= ?c_add_strides (ECons ?c_row_stride (ECons ?c_col_stride (ENil))))
-        (= ?add_out_strides (ECons ?d_row_stride (ECons ?d_col_stride (ENil))))
-        (= ?c_col_stride (MIter))
-        (!= ?c_row_stride (MNum 0))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?c_dtype (dtype ?c))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order "ROW" "ROW"
-            ?lda ?ldb ?c_row_stride ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b (MNum 0) ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 1.0 ?epilogue)
-            (ICons ?a (ICons ?b (ICons ?c ?matmul_tail)))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order 2d c plus matmul beta"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?matmul_c_order "ROW"
-            ?lda ?ldb ?matmul_ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?matmul_stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 ?epilogue)
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (!= ?epilogue "RELU")
-        (!= ?epilogue "RELU_BIAS")
-        (!= ?epilogue "GELU")
-        (!= ?epilogue "GELU_BIAS")
-
-        (= ?add (Op (Add
-            (ECons ?batch (ECons ?m (ECons ?n (ENil))))
-            ?matmul_add_strides
-            ?c_add_strides
-            ?add_out_strides)
-            (ICons ?matmul (ICons ?c (INil)))))
-
-        (= ?matmul_add_strides (ECons ?d_batch_stride (ECons ?d_row_stride (ECons ?d_col_stride (ENil)))))
-        (= ?c_add_strides (ECons ?c_batch_stride (ECons ?c_row_stride (ECons ?c_col_stride (ENil)))))
-        (= ?add_out_strides (ECons ?d_batch_stride (ECons ?d_row_stride (ECons ?d_col_stride (ENil)))))
-        (= ?c_col_stride (MIter))
-        (!= ?c_row_stride (MNum 0))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?c_dtype (dtype ?c))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order "ROW" "ROW"
-            ?lda ?ldb ?c_row_stride ?ldd
-            ?batch
-            ?stride_a ?stride_b ?c_batch_stride ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 1.0 ?epilogue)
-            (ICons ?a (ICons ?b (ICons ?c ?matmul_tail)))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order batched matmul plus c beta"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?matmul_c_order "ROW"
-            ?lda ?ldb ?matmul_ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?matmul_stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 ?epilogue)
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (!= ?epilogue "RELU")
-        (!= ?epilogue "RELU_BIAS")
-        (!= ?epilogue "GELU")
-        (!= ?epilogue "GELU_BIAS")
-
-        (= ?add (Op (Add
-            (ECons ?batch (ECons ?m (ECons ?n (ENil))))
-            ?c_add_strides
-            ?matmul_add_strides
-            ?add_out_strides)
-            (ICons ?c (ICons ?matmul (INil)))))
-
-        (= ?matmul_add_strides (ECons ?d_batch_stride (ECons ?d_row_stride (ECons ?d_col_stride (ENil)))))
-        (= ?c_add_strides (ECons ?c_batch_stride (ECons ?c_row_stride (ECons ?c_col_stride (ENil)))))
-        (= ?add_out_strides (ECons ?d_batch_stride (ECons ?d_row_stride (ECons ?d_col_stride (ENil)))))
-        (= ?c_col_stride (MIter))
-        (!= ?c_row_stride (MNum 0))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?c_dtype (dtype ?c))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order "ROW" "ROW"
-            ?lda ?ldb ?c_row_stride ?ldd
-            ?batch
-            ?stride_a ?stride_b ?c_batch_stride ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 1.0 ?epilogue)
-            (ICons ?a (ICons ?b (ICons ?c ?matmul_tail)))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order batched c plus matmul beta"
-)
--- a/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_epilogue_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_epilogue_rewrite.egg
@@ -1,614 +0,0 @@
-; cuBLASLt epilogue rewrites.
-;
-; ReLU in the frontend lowers through maximum_f32(0.0):
-;
-;   (matmul < 0) * 0 + cast(cast((-cast(matmul < 0) + 1) as bool) as f32) * matmul
-;
-; These rules fuse that expression back into CUBLASLT_EPILOGUE_RELU.
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype (F32)
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-
-        (= ?zero (Op (Constant 0.0) (INil)))
-        (= ?neg_one (Op (Constant -1.0) (INil)))
-        (= ?one (Op (Constant 1.0) (INil)))
-
-        (= ?lt (Op (LessThan
-            ?shape
-            ?matmul_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?mask_strides)
-            (ICons ?matmul (ICons ?zero (INil)))))
-        (= ?lt_f32 (Op (Cast ?size (F32)) (ICons ?lt (INil))))
-
-        (= ?zeroed (Op (Mul
-            ?shape
-            ?mask_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?zeroed_strides)
-            (ICons ?lt_f32 (ICons ?zero (INil)))))
-
-        (= ?neg_mask (Op (Mul
-            ?shape
-            ?mask_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?neg_mask_strides)
-            (ICons ?lt_f32 (ICons ?neg_one (INil)))))
-        (= ?not_mask_f32 (Op (Add
-            ?shape
-            ?neg_mask_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?not_mask_f32_strides)
-            (ICons ?neg_mask (ICons ?one (INil)))))
-        (= ?not_mask_bool (Op (Cast ?size (Bool)) (ICons ?not_mask_f32 (INil))))
-        (= ?not_mask (Op (Cast ?size (F32)) (ICons ?not_mask_bool (INil))))
-
-        (= ?positive (Op (Mul
-            ?shape
-            ?not_mask_f32_strides
-            ?matmul_strides
-            ?positive_strides)
-            (ICons ?not_mask (ICons ?matmul (INil)))))
-        (= ?relu (Op (Add
-            ?shape
-            ?zeroed_strides
-            ?positive_strides
-            ?relu_strides)
-            (ICons ?zeroed (ICons ?positive (INil)))))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype (F32)
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "RELU")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (union ?relu ?fused)
-        (set (dtype ?fused) (F32))
-    )
-    :ruleset matmul_backend
-    :name "cublaslt 2d relu epilogue"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype (F32)
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-
-        (= ?zero (Op (Constant 0.0) (INil)))
-        (= ?neg_one (Op (Constant -1.0) (INil)))
-        (= ?one (Op (Constant 1.0) (INil)))
-
-        (= ?lt (Op (LessThan
-            ?shape
-            ?matmul_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?mask_strides)
-            (ICons ?matmul (ICons ?zero (INil)))))
-        (= ?lt_f32 (Op (Cast ?size (F32)) (ICons ?lt (INil))))
-
-        (= ?zeroed (Op (Mul
-            ?shape
-            ?mask_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?zeroed_strides)
-            (ICons ?lt_f32 (ICons ?zero (INil)))))
-
-        (= ?neg_mask (Op (Mul
-            ?shape
-            ?mask_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?neg_mask_strides)
-            (ICons ?lt_f32 (ICons ?neg_one (INil)))))
-        (= ?not_mask_f32 (Op (Add
-            ?shape
-            ?neg_mask_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?not_mask_f32_strides)
-            (ICons ?neg_mask (ICons ?one (INil)))))
-        (= ?not_mask_bool (Op (Cast ?size (Bool)) (ICons ?not_mask_f32 (INil))))
-        (= ?not_mask (Op (Cast ?size (F32)) (ICons ?not_mask_bool (INil))))
-
-        (= ?positive (Op (Mul
-            ?shape
-            ?not_mask_f32_strides
-            ?matmul_strides
-            ?positive_strides)
-            (ICons ?not_mask (ICons ?matmul (INil)))))
-        (= ?relu (Op (Add
-            ?shape
-            ?zeroed_strides
-            ?positive_strides
-            ?relu_strides)
-            (ICons ?zeroed (ICons ?positive (INil)))))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype (F32)
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "RELU")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (union ?relu ?fused)
-        (set (dtype ?fused) (F32))
-    )
-    :ruleset matmul_backend
-    :name "cublaslt batched relu epilogue"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype (F32)
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "BIAS")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-
-        (= ?zero (Op (Constant 0.0) (INil)))
-        (= ?neg_one (Op (Constant -1.0) (INil)))
-        (= ?one (Op (Constant 1.0) (INil)))
-
-        (= ?lt (Op (LessThan
-            ?shape
-            ?matmul_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?mask_strides)
-            (ICons ?matmul (ICons ?zero (INil)))))
-        (= ?lt_f32 (Op (Cast ?size (F32)) (ICons ?lt (INil))))
-
-        (= ?zeroed (Op (Mul
-            ?shape
-            ?mask_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?zeroed_strides)
-            (ICons ?lt_f32 (ICons ?zero (INil)))))
-
-        (= ?neg_mask (Op (Mul
-            ?shape
-            ?mask_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?neg_mask_strides)
-            (ICons ?lt_f32 (ICons ?neg_one (INil)))))
-        (= ?not_mask_f32 (Op (Add
-            ?shape
-            ?neg_mask_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?not_mask_f32_strides)
-            (ICons ?neg_mask (ICons ?one (INil)))))
-        (= ?not_mask_bool (Op (Cast ?size (Bool)) (ICons ?not_mask_f32 (INil))))
-        (= ?not_mask (Op (Cast ?size (F32)) (ICons ?not_mask_bool (INil))))
-
-        (= ?positive (Op (Mul
-            ?shape
-            ?not_mask_f32_strides
-            ?matmul_strides
-            ?positive_strides)
-            (ICons ?not_mask (ICons ?matmul (INil)))))
-        (= ?relu (Op (Add
-            ?shape
-            ?zeroed_strides
-            ?positive_strides
-            ?relu_strides)
-            (ICons ?zeroed (ICons ?positive (INil)))))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype (F32)
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "RELU_BIAS")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (union ?relu ?fused)
-        (set (dtype ?fused) (F32))
-    )
-    :ruleset matmul_backend
-    :name "cublaslt 2d relu bias epilogue"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype (F32)
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "BIAS")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-
-        (= ?zero (Op (Constant 0.0) (INil)))
-        (= ?neg_one (Op (Constant -1.0) (INil)))
-        (= ?one (Op (Constant 1.0) (INil)))
-
-        (= ?lt (Op (LessThan
-            ?shape
-            ?matmul_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?mask_strides)
-            (ICons ?matmul (ICons ?zero (INil)))))
-        (= ?lt_f32 (Op (Cast ?size (F32)) (ICons ?lt (INil))))
-
-        (= ?zeroed (Op (Mul
-            ?shape
-            ?mask_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?zeroed_strides)
-            (ICons ?lt_f32 (ICons ?zero (INil)))))
-
-        (= ?neg_mask (Op (Mul
-            ?shape
-            ?mask_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?neg_mask_strides)
-            (ICons ?lt_f32 (ICons ?neg_one (INil)))))
-        (= ?not_mask_f32 (Op (Add
-            ?shape
-            ?neg_mask_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?not_mask_f32_strides)
-            (ICons ?neg_mask (ICons ?one (INil)))))
-        (= ?not_mask_bool (Op (Cast ?size (Bool)) (ICons ?not_mask_f32 (INil))))
-        (= ?not_mask (Op (Cast ?size (F32)) (ICons ?not_mask_bool (INil))))
-
-        (= ?positive (Op (Mul
-            ?shape
-            ?not_mask_f32_strides
-            ?matmul_strides
-            ?positive_strides)
-            (ICons ?not_mask (ICons ?matmul (INil)))))
-        (= ?relu (Op (Add
-            ?shape
-            ?zeroed_strides
-            ?positive_strides
-            ?relu_strides)
-            (ICons ?zeroed (ICons ?positive (INil)))))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype (F32)
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "RELU_BIAS")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (union ?relu ?fused)
-        (set (dtype ?fused) (F32))
-    )
-    :ruleset matmul_backend
-    :name "cublaslt batched relu bias epilogue"
-)
-
-; Canonical tanh-approx GELU can also appear directly as:
-;
-;   x * sigmoid(1.5957691216 * x * (1 + 0.044715 * x * x))
-;
-; Match that sigmoid form and fuse it into the cuBLASLt GELU epilogues.
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype (F32)
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-
-        (= ?gelu_coeff_inner (Op (Constant 0.044715) (INil)))
-        (= ?gelu_inner_scaled (Op (Mul ?gelu_inner_scaled_shape ?gelu_inner_scaled_a_stride ?gelu_inner_scaled_b_stride ?gelu_inner_scaled_out_stride) (ICons ?matmul (ICons ?gelu_coeff_inner (INil)))))
-        (= ?gelu_inner_quad (Op (Mul ?gelu_inner_quad_shape ?gelu_inner_quad_a_stride ?gelu_inner_quad_b_stride ?gelu_inner_quad_out_stride) (ICons ?gelu_inner_scaled (ICons ?matmul (INil)))))
-        (= ?gelu_one (Op (Constant 1.000000) (INil)))
-        (= ?gelu_poly (Op (Add ?gelu_poly_shape ?gelu_poly_a_stride ?gelu_poly_b_stride ?gelu_poly_out_stride) (ICons ?gelu_inner_quad (ICons ?gelu_one (INil)))))
-        (= ?gelu_coeff_outer (Op (Constant 1.595769) (INil)))
-        (= ?gelu_outer_scaled (Op (Mul ?gelu_outer_scaled_shape ?gelu_outer_scaled_a_stride ?gelu_outer_scaled_b_stride ?gelu_outer_scaled_out_stride) (ICons ?matmul (ICons ?gelu_coeff_outer (INil)))))
-        (= ?gelu_scaled (Op (Mul ?gelu_scaled_shape ?gelu_scaled_a_stride ?gelu_scaled_b_stride ?gelu_scaled_out_stride) (ICons ?gelu_outer_scaled (ICons ?gelu_poly (INil)))))
-        (= ?neg1 (Op (Constant -1.000000) (INil)))
-        (= ?gelu_neg (Op (Mul ?gelu_neg_shape ?gelu_neg_a_stride ?gelu_neg_b_stride ?gelu_neg_out_stride) (ICons ?gelu_scaled (ICons ?neg1 (INil)))))
-        (= ?log2e (Op (Constant 1.442695) (INil)))
-        (= ?gelu_exp_scaled (Op (Mul ?gelu_exp_scaled_shape ?gelu_exp_scaled_a_stride ?gelu_exp_scaled_b_stride ?gelu_exp_scaled_out_stride) (ICons ?gelu_neg (ICons ?log2e (INil)))))
-        (= ?gelu_exp2_val (Op (Exp2 ?gelu_exp_shape ?gelu_exp_in_stride ?gelu_exp_out_stride) (ICons ?gelu_exp_scaled (INil))))
-        (= ?gelu_plus1 (Op (Add ?gelu_plus1_shape ?gelu_plus1_a_stride ?gelu_plus1_b_stride ?gelu_plus1_out_stride) (ICons ?gelu_exp2_val (ICons ?gelu_one (INil)))))
-        (= ?gelu_sigmoid (Op (Recip ?gelu_sigmoid_shape ?gelu_sigmoid_in_stride ?gelu_sigmoid_out_stride) (ICons ?gelu_plus1 (INil))))
-        (= ?gelu_out (Op (Mul ?gelu_out_shape ?gelu_out_a_stride ?gelu_out_b_stride ?gelu_out_out_stride) (ICons ?matmul (ICons ?gelu_sigmoid (INil)))))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype (F32)
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "GELU")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (union ?gelu_out ?fused)
-        (set (dtype ?fused) (F32))
-    )
-    :ruleset matmul_backend
-    :name "cublaslt gelu epilogue"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype (F32)
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "BIAS")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-
-        (= ?gelu_coeff_inner (Op (Constant 0.044715) (INil)))
-        (= ?gelu_inner_scaled (Op (Mul ?gelu_inner_scaled_shape ?gelu_inner_scaled_a_stride ?gelu_inner_scaled_b_stride ?gelu_inner_scaled_out_stride) (ICons ?matmul (ICons ?gelu_coeff_inner (INil)))))
-        (= ?gelu_inner_quad (Op (Mul ?gelu_inner_quad_shape ?gelu_inner_quad_a_stride ?gelu_inner_quad_b_stride ?gelu_inner_quad_out_stride) (ICons ?gelu_inner_scaled (ICons ?matmul (INil)))))
-        (= ?gelu_one (Op (Constant 1.000000) (INil)))
-        (= ?gelu_poly (Op (Add ?gelu_poly_shape ?gelu_poly_a_stride ?gelu_poly_b_stride ?gelu_poly_out_stride) (ICons ?gelu_inner_quad (ICons ?gelu_one (INil)))))
-        (= ?gelu_coeff_outer (Op (Constant 1.595769) (INil)))
-        (= ?gelu_outer_scaled (Op (Mul ?gelu_outer_scaled_shape ?gelu_outer_scaled_a_stride ?gelu_outer_scaled_b_stride ?gelu_outer_scaled_out_stride) (ICons ?matmul (ICons ?gelu_coeff_outer (INil)))))
-        (= ?gelu_scaled (Op (Mul ?gelu_scaled_shape ?gelu_scaled_a_stride ?gelu_scaled_b_stride ?gelu_scaled_out_stride) (ICons ?gelu_outer_scaled (ICons ?gelu_poly (INil)))))
-        (= ?neg1 (Op (Constant -1.000000) (INil)))
-        (= ?gelu_neg (Op (Mul ?gelu_neg_shape ?gelu_neg_a_stride ?gelu_neg_b_stride ?gelu_neg_out_stride) (ICons ?gelu_scaled (ICons ?neg1 (INil)))))
-        (= ?log2e (Op (Constant 1.442695) (INil)))
-        (= ?gelu_exp_scaled (Op (Mul ?gelu_exp_scaled_shape ?gelu_exp_scaled_a_stride ?gelu_exp_scaled_b_stride ?gelu_exp_scaled_out_stride) (ICons ?gelu_neg (ICons ?log2e (INil)))))
-        (= ?gelu_exp2_val (Op (Exp2 ?gelu_exp_shape ?gelu_exp_in_stride ?gelu_exp_out_stride) (ICons ?gelu_exp_scaled (INil))))
-        (= ?gelu_plus1 (Op (Add ?gelu_plus1_shape ?gelu_plus1_a_stride ?gelu_plus1_b_stride ?gelu_plus1_out_stride) (ICons ?gelu_exp2_val (ICons ?gelu_one (INil)))))
-        (= ?gelu_sigmoid (Op (Recip ?gelu_sigmoid_shape ?gelu_sigmoid_in_stride ?gelu_sigmoid_out_stride) (ICons ?gelu_plus1 (INil))))
-        (= ?gelu_out (Op (Mul ?gelu_out_shape ?gelu_out_a_stride ?gelu_out_b_stride ?gelu_out_out_stride) (ICons ?matmul (ICons ?gelu_sigmoid (INil)))))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype (F32)
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "GELU_BIAS")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (union ?gelu_out ?fused)
-        (set (dtype ?fused) (F32))
-    )
-    :ruleset matmul_backend
-    :name "cublaslt gelu bias epilogue"
-)
-
-; This first slice fuses column-bias adds into CUBLASLT_EPILOGUE_BIAS for the
-; older COL-ordered output view. In that view Luminal's logical [m,n] output is
-; represented as a cuBLASLt [n,m] matrix, so cuBLASLt's row-broadcast bias maps
-; to the common logical column bias of length n.
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order "COL"
-            ?lda ?ldb ?ldc ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b (INil)))))
-
-        (= ?add (Op (Add
-            (ECons ?n (ECons ?m (ENil)))
-            ?matmul_add_strides
-            ?bias_add_strides
-            ?add_out_strides)
-            (ICons ?matmul (ICons ?bias (INil)))))
-
-        (= ?bias_add_strides (ECons (MNum 0) (ECons (MIter) (ENil))))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?d_dtype (dtype ?bias))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order "COL"
-            ?lda ?ldb ?ldc ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "BIAS")
-            (ICons ?a (ICons ?b (ICons ?bias (INil))))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt 2d matmul plus column bias epilogue"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order "COL"
-            ?lda ?ldb ?ldc ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b (INil)))))
-
-        (= ?add (Op (Add
-            (ECons ?n (ECons ?m (ENil)))
-            ?bias_add_strides
-            ?matmul_add_strides
-            ?add_out_strides)
-            (ICons ?bias (ICons ?matmul (INil)))))
-
-        (= ?bias_add_strides (ECons (MNum 0) (ECons (MIter) (ENil))))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?d_dtype (dtype ?bias))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order "COL"
-            ?lda ?ldb ?ldc ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "BIAS")
-            (ICons ?a (ICons ?b (ICons ?bias (INil))))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt 2d column bias plus matmul epilogue"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order "COL"
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b (INil)))))
-
-        (= ?add (Op (Add
-            (ECons ?batch (ECons ?n (ECons ?m (ENil))))
-            ?matmul_add_strides
-            ?bias_add_strides
-            ?add_out_strides)
-            (ICons ?matmul (ICons ?bias (INil)))))
-
-        (= ?bias_add_strides (ECons (MNum 0) (ECons (MNum 0) (ECons (MIter) (ENil)))))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?d_dtype (dtype ?bias))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order "COL"
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "BIAS")
-            (ICons ?a (ICons ?b (ICons ?bias (INil))))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt batched matmul plus column bias epilogue"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order "COL"
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b (INil)))))
-
-        (= ?add (Op (Add
-            (ECons ?batch (ECons ?n (ECons ?m (ENil))))
-            ?bias_add_strides
-            ?matmul_add_strides
-            ?add_out_strides)
-            (ICons ?bias (ICons ?matmul (INil)))))
-
-        (= ?bias_add_strides (ECons (MNum 0) (ECons (MNum 0) (ECons (MIter) (ENil)))))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?d_dtype (dtype ?bias))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order "COL"
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "BIAS")
-            (ICons ?a (ICons ?b (ICons ?bias (INil))))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt batched column bias plus matmul epilogue"
-)
--- a/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_fp8_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_fp8_rewrite.egg
@@ -1,775 +0,0 @@
-; FP8 support is narrower than "any FP8 x any FP8". cuBLASLt's regular FP8
-; matmul table supports these A/B descriptor pairs for F32 outputs:
-;   E4M3 x E4M3
-;   E4M3 x E5M2
-;   E5M2 x E4M3
-; and requires TN format on Ada/Hopper-class GPUs. These rules therefore match
-; row-major x column-major Luminal matmuls, which the existing COL-order lowering
-; describes as descriptor A = logical B, descriptor B = logical A, transa=T,
-; transb=N.
-
-(rule
-    (
-        ; Match the scaled FP8 linear form directly before the unscaled FP8
-        ; matmul rewrite can hide the quantize/dequant scale structure.
-        (= ?scaled_activation (Op (Mul
-            ?activation_shape
-            ?raw_activation_strides
-            ?recip_activation_strides
-            ?activation_out_strides)
-            (ICons ?raw_activation (ICons ?recip_input_scale (INil)))))
-        (= ?recip_input_scale (Op (Recip
-            ?activation_shape
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?recip_out_strides)
-            (ICons ?input_scale (INil))))
-        (= ?a (Op (Cast ?a_size ?a_dtype) (ICons ?scaled_activation (INil))))
-
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-        (= ?cast (Op (Cast ?size (F32)) (ICons ?sum (INil))))
-        (= ?scale_product (Op (Mul (ENil) (ENil) (ENil) (ENil))
-            (ICons ?input_scale (ICons ?weight_scale (INil)))))
-        (= ?scaled (Op (Mul
-            ?out_shape
-            ?cast_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?scaled_out_strides)
-            (ICons ?cast (ICons ?scale_product (INil)))))
-        (= ?cast_strides ?scaled_out_strides)
-
-        (= ?out_shape (ECons ?m (ECons ?n (ENil))))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-
-        (= ?a_stride (ECons ?a_m_stride (ECons ?a_n_stride (ECons ?a_k_stride (ENil)))))
-        (= ?b_stride (ECons ?b_m_stride (ECons ?b_n_stride (ECons ?b_k_stride (ENil)))))
-        (= ?k_stride (MIter))
-
-        (= ?a_m_stride (MMul (MIter) ?k))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_k_stride (MIter))
-
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MMul (MIter) ?k))
-        (= ?b_k_stride (MIter))
-
-        (= ?b_dtype (dtype ?b))
-        (cublaslt_fp8_f32_output_pair ?a_dtype ?b_dtype)
-    )
-    (
-        (let ?sgemm (Op (cublaslt_scaled
-            ?n ?m ?k
-            "T" "N"
-            "COL" "COL" "COL" "COL"
-            ?b_n_stride
-            ?a_m_stride
-            ?n
-            ?n
-            (MNum 1)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            ?b_dtype ?a_dtype (F32) (F32) "32F" "F32" 1.0 0.0 "DEFAULT")
-            (ICons ?b (ICons ?a (ICons ?weight_scale (ICons ?input_scale (INil)))))))
-        (union ?scaled ?sgemm)
-        (set (dtype ?sgemm) (F32))
-    )
-:ruleset matmul_backend
-:name "cublaslt scaled fp8 row-major x column-major f32 output"
-)
-
-(rule
-    (
-        (= ?scaled_activation (Op (Mul
-            ?activation_shape
-            ?raw_activation_strides
-            ?recip_activation_strides
-            ?activation_out_strides)
-            (ICons ?raw_activation (ICons ?recip_input_scale (INil)))))
-        (= ?recip_input_scale (Op (Recip
-            ?activation_shape
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?recip_out_strides)
-            (ICons ?input_scale (INil))))
-        (= ?a (Op (Cast ?a_size ?a_dtype) (ICons ?scaled_activation (INil))))
-
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-        (= ?cast (Op (Cast ?size (F32)) (ICons ?sum (INil))))
-        (= ?scale_product (Op (Mul (ENil) (ENil) (ENil) (ENil))
-            (ICons ?input_scale (ICons ?weight_scale (INil)))))
-        (= ?scaled (Op (Mul
-            ?out_shape
-            ?cast_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?scaled_out_strides)
-            (ICons ?cast (ICons ?scale_product (INil)))))
-        (= ?cast_strides ?scaled_out_strides)
-
-        (= ?out_shape (ECons ?m (ECons ?n (ENil))))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-
-        (= ?a_stride (ECons ?a_m_stride (ECons ?a_n_stride (ECons ?a_k_stride (ENil)))))
-        (= ?b_stride (ECons ?b_m_stride (ECons ?b_n_stride (ECons ?b_k_stride (ENil)))))
-        (= ?k_stride (MIter))
-
-        (= ?a_m_stride (MMul (MIter) ?k))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_k_stride (MIter))
-
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MMul (MIter) ?k))
-        (= ?b_k_stride (MIter))
-
-        (= ?b_dtype (dtype ?b))
-        (cublaslt_fp8_f32_output_pair ?a_dtype ?b_dtype)
-        (= ?scaled (Op (cublaslt_scaled
-            ?n ?m ?k
-            "T" "N"
-            "COL" "COL" "COL" "COL"
-            ?b_n_stride
-            ?a_m_stride
-            ?n
-            ?n
-            (MNum 1)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            ?b_dtype ?a_dtype (F32) (F32) "32F" "F32" 1.0 0.0 "DEFAULT")
-            (ICons ?b (ICons ?a (ICons ?weight_scale (ICons ?input_scale (INil)))))))
-        (= ?cast (Op (cublaslt
-            ?n ?m ?k
-            "T" "N"
-            "COL" "COL" "COL" "COL"
-            ?b_n_stride
-            ?a_m_stride
-            ?n
-            ?n
-            (MNum 1)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            ?b_dtype ?a_dtype (F32) (F32) "32F" "F32" 1.0 0.0 "DEFAULT")
-            (ICons ?b (ICons ?a (INil)))))
-    )
-    (
-        (delete (Op (Mul
-            ?out_shape
-            ?cast_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?scaled_out_strides)
-            (ICons ?cast (ICons ?scale_product (INil)))))
-        (delete (Op (cublaslt
-            ?n ?m ?k
-            "T" "N"
-            "COL" "COL" "COL" "COL"
-            ?b_n_stride
-            ?a_m_stride
-            ?n
-            ?n
-            (MNum 1)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            ?b_dtype ?a_dtype (F32) (F32) "32F" "F32" 1.0 0.0 "DEFAULT")
-            (ICons ?b (ICons ?a (INil)))))
-    )
-    :ruleset cleanup
-    :name "delete raw fp8 path when scaled cublaslt covers direct output scale"
-)
-
-(rule
-    (
-        ; Fusion growth can make the live path consume a raw FP8 cuBLASLt
-        ; candidate through an internal CudaBinaryElementwise scale multiply,
-        ; instead of the original HLIR output-scale Mul. The scalar scale
-        ; product is tensor-wide, so the two scalar factors can be passed as
-        ; cuBLASLt A/B scale inputs and the internal multiply can be bypassed.
-        (= ?raw_gemm (Op (cublaslt
-            ?cm ?cn ?ck
-            ?cta ?ctb
-            ?cao ?cbo ?cco ?cdo
-            ?clda ?cldb ?cldc ?cldd
-            ?cbc ?csa ?csb ?csc ?csd
-            ?cadt ?cbdt ?ccdt ?cddt ?ccompute ?cscale ?calpha ?cbeta ?cepilogue)
-            (ICons ?a (ICons ?b (INil)))))
-        (cublaslt_fp8_f32_output_pair ?cadt ?cbdt)
-        (= ?ccdt (F32))
-        (= ?cddt (F32))
-        (= ?cbeta 0.0)
-        (= ?cepilogue "DEFAULT")
-
-        (= ?fs_cast (Op (FusionStart
-            ?out_shape
-            ?cast_strides
-            (F32))
-            (ICons ?raw_gemm (INil))))
-
-        (= ?out_shape (ECons ?out_m (ECons ?out_n (ENil))))
-        (= ?scale_strides (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-
-        (= ?fs_a_scale (Op (FusionStart (ENil) (ENil) (F32))
-            (ICons ?a_scale (INil))))
-        (= ?fs_b_scale (Op (FusionStart (ENil) (ENil) (F32))
-            (ICons ?b_scale (INil))))
-        (= ?scale_product_inner (Op (CudaBinaryElementwise
-            "Mul"
-            (ENil)
-            (ENil)
-            (ENil)
-            (ENil)
-            (F32))
-            (ICons ?fs_a_scale (ICons ?fs_b_scale (INil)))))
-        (= ?scale_product (Op (FusionEnd (ENil) (ENil) (F32))
-            (ICons ?scale_product_inner (INil))))
-        (= ?fs_scale (Op (FusionStart
-            ?out_shape
-            ?scale_strides
-            (F32))
-            (ICons ?scale_product (INil))))
-        (= ?fused_scale (Op (CudaBinaryElementwise
-            "Mul"
-            ?out_shape
-            ?cast_strides
-            ?scale_strides
-            ?scaled_out_strides
-            (F32))
-            (ICons ?fs_cast (ICons ?fs_scale (INil)))))
-        (= ?cast_strides ?scaled_out_strides)
-    )
-    (
-        (let ?sgemm (Op (cublaslt_scaled
-            ?cm ?cn ?ck
-            ?cta ?ctb
-            ?cao ?cbo ?cco ?cdo
-            ?clda ?cldb ?cldc ?cldd
-            ?cbc ?csa ?csb ?csc ?csd
-            ?cadt ?cbdt ?ccdt ?cddt ?ccompute ?cscale ?calpha ?cbeta ?cepilogue)
-            (ICons ?a (ICons ?b (ICons ?a_scale (ICons ?b_scale (INil)))))))
-        (let ?fs_sgemm (Op (FusionStart ?out_shape ?scaled_out_strides (F32))
-            (ICons ?sgemm (INil))))
-        (union ?fused_scale ?fs_sgemm)
-        (set (dtype ?sgemm) (F32))
-        (set (dtype ?fs_sgemm) (F32))
-    )
-    :ruleset fusion_grow
-    :name "cublaslt scaled fp8 fused output-scale f32 output"
-)
-
-(rule
-    (
-        (= ?raw_gemm (Op (cublaslt
-            ?cm ?cn ?ck
-            ?cta ?ctb
-            ?cao ?cbo ?cco ?cdo
-            ?clda ?cldb ?cldc ?cldd
-            ?cbc ?csa ?csb ?csc ?csd
-            ?cadt ?cbdt ?ccdt ?cddt ?ccompute ?cscale ?calpha ?cbeta ?cepilogue)
-            (ICons ?a (ICons ?b (INil)))))
-        (cublaslt_fp8_f32_output_pair ?cadt ?cbdt)
-        (= ?ccdt (F32))
-        (= ?cddt (F32))
-        (= ?cbeta 0.0)
-        (= ?cepilogue "DEFAULT")
-
-        (= ?fs_cast (Op (FusionStart
-            ?out_shape
-            ?cast_strides
-            (F32))
-            (ICons ?raw_gemm (INil))))
-
-        (= ?out_shape (ECons ?out_m (ECons ?out_n (ENil))))
-        (= ?scale_strides (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-
-        (= ?fs_a_scale (Op (FusionStart (ENil) (ENil) (F32))
-            (ICons ?a_scale (INil))))
-        (= ?fs_b_scale (Op (FusionStart (ENil) (ENil) (F32))
-            (ICons ?b_scale (INil))))
-        (= ?scale_product_inner (Op (CudaBinaryElementwise
-            "Mul"
-            (ENil)
-            (ENil)
-            (ENil)
-            (ENil)
-            (F32))
-            (ICons ?fs_a_scale (ICons ?fs_b_scale (INil)))))
-        (= ?scale_product (Op (FusionEnd (ENil) (ENil) (F32))
-            (ICons ?scale_product_inner (INil))))
-        (= ?fs_scale (Op (FusionStart
-            ?out_shape
-            ?scale_strides
-            (F32))
-            (ICons ?scale_product (INil))))
-        (= ?fused_scale (Op (CudaBinaryElementwise
-            "Mul"
-            ?out_shape
-            ?cast_strides
-            ?scale_strides
-            ?scaled_out_strides
-            (F32))
-            (ICons ?fs_cast (ICons ?fs_scale (INil)))))
-        (= ?cast_strides ?scaled_out_strides)
-
-        (= ?sgemm (Op (cublaslt_scaled
-            ?cm ?cn ?ck
-            ?cta ?ctb
-            ?cao ?cbo ?cco ?cdo
-            ?clda ?cldb ?cldc ?cldd
-            ?cbc ?csa ?csb ?csc ?csd
-            ?cadt ?cbdt ?ccdt ?cddt ?ccompute ?cscale ?calpha ?cbeta ?cepilogue)
-            (ICons ?a (ICons ?b (ICons ?a_scale (ICons ?b_scale (INil)))))))
-        (= ?fused_scale (Op (FusionStart ?out_shape ?scaled_out_strides (F32))
-            (ICons ?sgemm (INil))))
-    )
-    (
-        (delete (Op (cublaslt
-            ?cm ?cn ?ck
-            ?cta ?ctb
-            ?cao ?cbo ?cco ?cdo
-            ?clda ?cldb ?cldc ?cldd
-            ?cbc ?csa ?csb ?csc ?csd
-            ?cadt ?cbdt ?ccdt ?cddt ?ccompute ?cscale ?calpha ?cbeta ?cepilogue)
-            (ICons ?a (ICons ?b (INil)))))
-        (delete (Op (CudaBinaryElementwise
-            "Mul"
-            ?out_shape
-            ?cast_strides
-            ?scale_strides
-            ?scaled_out_strides
-            (F32))
-            (ICons ?fs_cast (ICons ?fs_scale (INil)))))
-    )
-    :ruleset cleanup
-    :name "delete raw fp8 path when scaled cublaslt covers fused output scale"
-)
-
-(rule
-    (
-        ; Batched form of the scaled FP8 linear rewrite. The scale operands are
-        ; scalar tensors expanded across the last three output/activation axes.
-        (= ?scaled_activation (Op (Mul
-            ?activation_shape
-            ?raw_activation_strides
-            ?recip_activation_strides
-            ?activation_out_strides)
-            (ICons ?raw_activation (ICons ?recip_input_scale (INil)))))
-        (= ?recip_input_scale (Op (Recip
-            ?activation_shape
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?recip_out_strides)
-            (ICons ?input_scale (INil))))
-        (= ?a (Op (Cast ?a_size ?a_dtype) (ICons ?scaled_activation (INil))))
-
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-        (= ?cast (Op (Cast ?size (F32)) (ICons ?sum (INil))))
-        (= ?scale_product (Op (Mul (ENil) (ENil) (ENil) (ENil))
-            (ICons ?input_scale (ICons ?weight_scale (INil)))))
-        (= ?scaled (Op (Mul
-            ?out_shape
-            ?cast_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?scaled_out_strides)
-            (ICons ?cast (ICons ?scale_product (INil)))))
-        (= ?cast_strides ?scaled_out_strides)
-
-        (= ?batch (nth_from_end ?out_shape 2))
-        (= ?m (nth_from_end ?out_shape 1))
-        (= ?n (nth_from_end ?out_shape 0))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-        (!= ?batch (MNum 0))
-
-        (= ?a_batch_stride (nth_from_end ?a_stride 3))
-        (= ?a_m_stride (nth_from_end ?a_stride 2))
-        (= ?a_n_stride (nth_from_end ?a_stride 1))
-        (= ?a_k_stride (nth_from_end ?a_stride 0))
-
-        (= ?b_batch_stride (nth_from_end ?b_stride 3))
-        (= ?b_m_stride (nth_from_end ?b_stride 2))
-        (= ?b_n_stride (nth_from_end ?b_stride 1))
-        (= ?b_k_stride (nth_from_end ?b_stride 0))
-
-        (= ?k_stride (MIter))
-
-        (= ?a_k_stride (MIter))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_m_stride (MMul (MIter) ?k))
-
-        (= ?b_k_stride (MIter))
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MMul (MIter) ?k))
-
-        (= ?a_batch_stride (MMul ?m ?a_m_stride))
-        (= ?b_batch_stride (MMul ?n ?b_n_stride))
-
-        (= ?b_dtype (dtype ?b))
-        (cublaslt_fp8_f32_output_pair ?a_dtype ?b_dtype)
-    )
-    (
-        (let ?sgemm (Op (cublaslt_scaled
-            ?n ?m ?k
-            "T" "N"
-            "COL" "COL" "COL" "COL"
-            ?b_n_stride
-            ?a_m_stride
-            ?n
-            ?n
-            ?batch
-            ?b_batch_stride
-            ?a_batch_stride
-            (MMul ?m ?n)
-            (MMul ?m ?n)
-            ?b_dtype ?a_dtype (F32) (F32) "32F" "F32" 1.0 0.0 "DEFAULT")
-            (ICons ?b (ICons ?a (ICons ?weight_scale (ICons ?input_scale (INil)))))))
-        (union ?scaled ?sgemm)
-        (set (dtype ?sgemm) (F32))
-    )
-:ruleset matmul_backend
-:name "cublaslt scaled fp8 batched row-major x column-major f32 output"
-)
-
-(rule
-    (
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-        (= ?cast (Op (Cast ?size (F32)) (ICons ?sum (INil))))
-
-        (= ?out_shape (ECons ?m (ECons ?n (ENil))))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-
-        (= ?a_stride (ECons ?a_m_stride (ECons ?a_n_stride (ECons ?a_k_stride (ENil)))))
-        (= ?b_stride (ECons ?b_m_stride (ECons ?b_n_stride (ECons ?b_k_stride (ENil)))))
-        (= ?k_stride (MIter))
-
-        (= ?a_m_stride (MMul (MIter) ?k))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_k_stride (MIter))
-
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MMul (MIter) ?k))
-        (= ?b_k_stride (MIter))
-
-        (= (F8E4M3) (dtype ?a))
-        (= (F8E4M3) (dtype ?b))
-    )
-    (
-        (let ?sgemm (Op (cublaslt
-            ?n ?m ?k
-            "T" "N"
-            "COL" "COL" "COL" "COL"
-            ?b_n_stride
-            ?a_m_stride
-            ?n
-            ?n
-            (MNum 1)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            (F8E4M3) (F8E4M3) (F32) (F32) "32F" "F32" 1.0 0.0 "DEFAULT")
-            (ICons ?b (ICons ?a (INil)))))
-        (union ?cast ?sgemm)
-        (set (dtype ?sgemm) (F32))
-    )
-    :ruleset matmul_backend
-    :name "cublaslt fp8 e4m3/e4m3 row-major x column-major f32 output"
-)
-
-(rule
-    (
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-        (= ?cast (Op (Cast ?size (F32)) (ICons ?sum (INil))))
-
-        (= ?out_shape (ECons ?m (ECons ?n (ENil))))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-
-        (= ?a_stride (ECons ?a_m_stride (ECons ?a_n_stride (ECons ?a_k_stride (ENil)))))
-        (= ?b_stride (ECons ?b_m_stride (ECons ?b_n_stride (ECons ?b_k_stride (ENil)))))
-        (= ?k_stride (MIter))
-
-        (= ?a_m_stride (MMul (MIter) ?k))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_k_stride (MIter))
-
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MMul (MIter) ?k))
-        (= ?b_k_stride (MIter))
-
-        (= (F8E4M3) (dtype ?a))
-        (= (F8E5M2) (dtype ?b))
-    )
-    (
-        (let ?sgemm (Op (cublaslt
-            ?n ?m ?k
-            "T" "N"
-            "COL" "COL" "COL" "COL"
-            ?b_n_stride
-            ?a_m_stride
-            ?n
-            ?n
-            (MNum 1)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            (F8E5M2) (F8E4M3) (F32) (F32) "32F" "F32" 1.0 0.0 "DEFAULT")
-            (ICons ?b (ICons ?a (INil)))))
-        (union ?cast ?sgemm)
-        (set (dtype ?sgemm) (F32))
-    )
-    :ruleset matmul_backend
-    :name "cublaslt fp8 e5m2/e4m3 row-major x column-major f32 output"
-)
-
-(rule
-    (
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-        (= ?cast (Op (Cast ?size (F32)) (ICons ?sum (INil))))
-
-        (= ?out_shape (ECons ?m (ECons ?n (ENil))))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-
-        (= ?a_stride (ECons ?a_m_stride (ECons ?a_n_stride (ECons ?a_k_stride (ENil)))))
-        (= ?b_stride (ECons ?b_m_stride (ECons ?b_n_stride (ECons ?b_k_stride (ENil)))))
-        (= ?k_stride (MIter))
-
-        (= ?a_m_stride (MMul (MIter) ?k))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_k_stride (MIter))
-
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MMul (MIter) ?k))
-        (= ?b_k_stride (MIter))
-
-        (= (F8E5M2) (dtype ?a))
-        (= (F8E4M3) (dtype ?b))
-    )
-    (
-        (let ?sgemm (Op (cublaslt
-            ?n ?m ?k
-            "T" "N"
-            "COL" "COL" "COL" "COL"
-            ?b_n_stride
-            ?a_m_stride
-            ?n
-            ?n
-            (MNum 1)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            (F8E4M3) (F8E5M2) (F32) (F32) "32F" "F32" 1.0 0.0 "DEFAULT")
-            (ICons ?b (ICons ?a (INil)))))
-        (union ?cast ?sgemm)
-        (set (dtype ?sgemm) (F32))
-    )
-    :ruleset matmul_backend
-    :name "cublaslt fp8 e4m3/e5m2 row-major x column-major f32 output"
-)
-
-(rule
-    (
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-        (= ?cast (Op (Cast ?size (F32)) (ICons ?sum (INil))))
-
-        (= ?batch (nth_from_end ?out_shape 2))
-        (= ?m (nth_from_end ?out_shape 1))
-        (= ?n (nth_from_end ?out_shape 0))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-        (!= ?batch (MNum 0))
-
-        (= ?a_batch_stride (nth_from_end ?a_stride 3))
-        (= ?a_m_stride (nth_from_end ?a_stride 2))
-        (= ?a_n_stride (nth_from_end ?a_stride 1))
-        (= ?a_k_stride (nth_from_end ?a_stride 0))
-
-        (= ?b_batch_stride (nth_from_end ?b_stride 3))
-        (= ?b_m_stride (nth_from_end ?b_stride 2))
-        (= ?b_n_stride (nth_from_end ?b_stride 1))
-        (= ?b_k_stride (nth_from_end ?b_stride 0))
-
-        (= ?k_stride (MIter))
-
-        (= ?a_k_stride (MIter))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_m_stride (MMul (MIter) ?k))
-
-        (= ?b_k_stride (MIter))
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MMul (MIter) ?k))
-
-        (= ?a_batch_stride (MMul ?m ?a_m_stride))
-        (= ?b_batch_stride (MMul ?n ?b_n_stride))
-
-        (= (F8E4M3) (dtype ?a))
-        (= (F8E4M3) (dtype ?b))
-    )
-    (
-        (let ?sgemm (Op (cublaslt
-            ?n ?m ?k
-            "T" "N"
-            "COL" "COL" "COL" "COL"
-            ?b_n_stride
-            ?a_m_stride
-            ?n
-            ?n
-            ?batch
-            ?b_batch_stride
-            ?a_batch_stride
-            (MMul ?m ?n)
-            (MMul ?m ?n)
-            (F8E4M3) (F8E4M3) (F32) (F32) "32F" "F32" 1.0 0.0 "DEFAULT")
-            (ICons ?b (ICons ?a (INil)))))
-        (union ?cast ?sgemm)
-        (set (dtype ?sgemm) (F32))
-    )
-    :ruleset matmul_backend
-    :name "cublaslt fp8 e4m3/e4m3 batched row-major x column-major f32 output"
-)
-
-(rule
-    (
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-        (= ?cast (Op (Cast ?size (F32)) (ICons ?sum (INil))))
-
-        (= ?batch (nth_from_end ?out_shape 2))
-        (= ?m (nth_from_end ?out_shape 1))
-        (= ?n (nth_from_end ?out_shape 0))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-        (!= ?batch (MNum 0))
-
-        (= ?a_batch_stride (nth_from_end ?a_stride 3))
-        (= ?a_m_stride (nth_from_end ?a_stride 2))
-        (= ?a_n_stride (nth_from_end ?a_stride 1))
-        (= ?a_k_stride (nth_from_end ?a_stride 0))
-
-        (= ?b_batch_stride (nth_from_end ?b_stride 3))
-        (= ?b_m_stride (nth_from_end ?b_stride 2))
-        (= ?b_n_stride (nth_from_end ?b_stride 1))
-        (= ?b_k_stride (nth_from_end ?b_stride 0))
-
-        (= ?k_stride (MIter))
-
-        (= ?a_k_stride (MIter))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_m_stride (MMul (MIter) ?k))
-
-        (= ?b_k_stride (MIter))
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MMul (MIter) ?k))
-
-        (= ?a_batch_stride (MMul ?m ?a_m_stride))
-        (= ?b_batch_stride (MMul ?n ?b_n_stride))
-
-        (= (F8E4M3) (dtype ?a))
-        (= (F8E5M2) (dtype ?b))
-    )
-    (
-        (let ?sgemm (Op (cublaslt
-            ?n ?m ?k
-            "T" "N"
-            "COL" "COL" "COL" "COL"
-            ?b_n_stride
-            ?a_m_stride
-            ?n
-            ?n
-            ?batch
-            ?b_batch_stride
-            ?a_batch_stride
-            (MMul ?m ?n)
-            (MMul ?m ?n)
-            (F8E5M2) (F8E4M3) (F32) (F32) "32F" "F32" 1.0 0.0 "DEFAULT")
-            (ICons ?b (ICons ?a (INil)))))
-        (union ?cast ?sgemm)
-        (set (dtype ?sgemm) (F32))
-    )
-    :ruleset matmul_backend
-    :name "cublaslt fp8 e5m2/e4m3 batched row-major x column-major f32 output"
-)
-
-(rule
-    (
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-        (= ?cast (Op (Cast ?size (F32)) (ICons ?sum (INil))))
-
-        (= ?batch (nth_from_end ?out_shape 2))
-        (= ?m (nth_from_end ?out_shape 1))
-        (= ?n (nth_from_end ?out_shape 0))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-        (!= ?batch (MNum 0))
-
-        (= ?a_batch_stride (nth_from_end ?a_stride 3))
-        (= ?a_m_stride (nth_from_end ?a_stride 2))
-        (= ?a_n_stride (nth_from_end ?a_stride 1))
-        (= ?a_k_stride (nth_from_end ?a_stride 0))
-
-        (= ?b_batch_stride (nth_from_end ?b_stride 3))
-        (= ?b_m_stride (nth_from_end ?b_stride 2))
-        (= ?b_n_stride (nth_from_end ?b_stride 1))
-        (= ?b_k_stride (nth_from_end ?b_stride 0))
-
-        (= ?k_stride (MIter))
-
-        (= ?a_k_stride (MIter))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_m_stride (MMul (MIter) ?k))
-
-        (= ?b_k_stride (MIter))
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MMul (MIter) ?k))
-
-        (= ?a_batch_stride (MMul ?m ?a_m_stride))
-        (= ?b_batch_stride (MMul ?n ?b_n_stride))
-
-        (= (F8E5M2) (dtype ?a))
-        (= (F8E4M3) (dtype ?b))
-    )
-    (
-        (let ?sgemm (Op (cublaslt
-            ?n ?m ?k
-            "T" "N"
-            "COL" "COL" "COL" "COL"
-            ?b_n_stride
-            ?a_m_stride
-            ?n
-            ?n
-            ?batch
-            ?b_batch_stride
-            ?a_batch_stride
-            (MMul ?m ?n)
-            (MMul ?m ?n)
-            (F8E4M3) (F8E5M2) (F32) (F32) "32F" "F32" 1.0 0.0 "DEFAULT")
-            (ICons ?b (ICons ?a (INil)))))
-        (union ?cast ?sgemm)
-        (set (dtype ?sgemm) (F32))
-    )
-    :ruleset matmul_backend
-    :name "cublaslt fp8 e4m3/e5m2 batched row-major x column-major f32 output"
-)
--- a/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_mixed_dtype_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_mixed_dtype_rewrite.egg
@@ -1,75 +0,0 @@
-; Mixed output dtype rewrites for cuBLASLt.
-;
-; The first mixed mode we need for low-precision matmuls is:
-;
-;   D[f32] = A[fp16/bf16] * B[fp16/bf16]
-;
-; Luminal graphs express this today as a Cast(F32) around a low-precision
-; matmul. cuBLASLt can write the f32 output directly, so expose that candidate
-; before beta fusion tries to consume an f32 C input.
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            (F16) (F16) (F16) (F16)
-            ?compute_type ?scale_dtype
-            ?alpha ?beta ?epilogue)
-            ?inputs))
-        (= ?cast (Op (Cast ?size (F32)) (ICons ?matmul (INil))))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            (F16) (F16) (F32) (F32)
-            ?compute_type ?scale_dtype
-            ?alpha ?beta ?epilogue)
-            ?inputs))
-        (union ?cast ?fused)
-        (set (dtype ?fused) (F32))
-    )
-    :ruleset matmul_backend
-    :name "cublaslt f16 matmul cast f32 output"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            (Bf16) (Bf16) (Bf16) (Bf16)
-            ?compute_type ?scale_dtype
-            ?alpha ?beta ?epilogue)
-            ?inputs))
-        (= ?cast (Op (Cast ?size (F32)) (ICons ?matmul (INil))))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            (Bf16) (Bf16) (F32) (F32)
-            ?compute_type ?scale_dtype
-            ?alpha ?beta ?epilogue)
-            ?inputs))
-        (union ?cast ?fused)
-        (set (dtype ?fused) (F32))
-    )
-    :ruleset matmul_backend
-    :name "cublaslt bf16 matmul cast f32 output"
-)
--- a/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_row_order_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_row_order_rewrite.egg
@@ -1,452 +0,0 @@
-; Natural cuBLASLt row-order output rewrites. These keep Luminal's logical
-; output C[m,n] as a cuBLASLt ROW-ordered D[m,n] instead of using the older
-; swapped COL-ordered D[n,m] view. A and B orders mirror their matched logical
-; layouts, so this family is the legal base for future ROW-ordered beta fusions.
-
-(rule
-    (
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-
-        (= ?out_shape (ECons ?m (ECons ?n (ENil))))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-
-        (= ?a_stride (ECons ?a_m_stride (ECons ?a_n_stride (ECons ?a_k_stride (ENil)))))
-        (= ?b_stride (ECons ?b_m_stride (ECons ?b_n_stride (ECons ?b_k_stride (ENil)))))
-        (= ?k_stride (MIter))
-
-        (= ?a_m_stride (MMul (MIter) ?k))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_k_stride (MIter))
-
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MIter))
-        (= ?b_k_stride (MMul (MIter) ?n))
-
-        (= ?dt (dtype ?a))
-        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
-    )
-    (
-        (let ?sgemm (Op (cublaslt
-            ?m ?n ?k
-            "N" "N"
-            "ROW" "ROW" "ROW" "ROW"
-            ?a_m_stride
-            ?b_k_stride
-            ?n
-            ?n
-            (MNum 1)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b (INil)))))
-        (union ?sum ?sgemm)
-        (set (dtype ?sgemm) ?dt)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order row-major x row-major"
-)
-
-(rule
-    (
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-
-        (= ?out_shape (ECons ?m (ECons ?n (ENil))))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-
-        (= ?a_stride (ECons ?a_m_stride (ECons ?a_n_stride (ECons ?a_k_stride (ENil)))))
-        (= ?b_stride (ECons ?b_m_stride (ECons ?b_n_stride (ECons ?b_k_stride (ENil)))))
-        (= ?k_stride (MIter))
-
-        (= ?a_m_stride (MMul (MIter) ?k))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_k_stride (MIter))
-
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MMul (MIter) ?k))
-        (= ?b_k_stride (MIter))
-
-        (= ?dt (dtype ?a))
-        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
-    )
-    (
-        (let ?sgemm (Op (cublaslt
-            ?m ?n ?k
-            "N" "N"
-            "ROW" "COL" "ROW" "ROW"
-            ?a_m_stride
-            ?b_n_stride
-            ?n
-            ?n
-            (MNum 1)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b (INil)))))
-        (union ?sum ?sgemm)
-        (set (dtype ?sgemm) ?dt)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order row-major x column-major"
-)
-
-(rule
-    (
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-
-        (= ?out_shape (ECons ?m (ECons ?n (ENil))))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-
-        (= ?a_stride (ECons ?a_m_stride (ECons ?a_n_stride (ECons ?a_k_stride (ENil)))))
-        (= ?b_stride (ECons ?b_m_stride (ECons ?b_n_stride (ECons ?b_k_stride (ENil)))))
-        (= ?k_stride (MIter))
-
-        (= ?a_m_stride (MIter))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_k_stride (MMul (MIter) ?m))
-
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MIter))
-        (= ?b_k_stride (MMul (MIter) ?n))
-
-        (= ?dt (dtype ?a))
-        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
-    )
-    (
-        (let ?sgemm (Op (cublaslt
-            ?m ?n ?k
-            "N" "N"
-            "COL" "ROW" "ROW" "ROW"
-            ?a_k_stride
-            ?b_k_stride
-            ?n
-            ?n
-            (MNum 1)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b (INil)))))
-        (union ?sum ?sgemm)
-        (set (dtype ?sgemm) ?dt)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order column-major x row-major"
-)
-
-(rule
-    (
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-
-        (= ?out_shape (ECons ?m (ECons ?n (ENil))))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-
-        (= ?a_stride (ECons ?a_m_stride (ECons ?a_n_stride (ECons ?a_k_stride (ENil)))))
-        (= ?b_stride (ECons ?b_m_stride (ECons ?b_n_stride (ECons ?b_k_stride (ENil)))))
-        (= ?k_stride (MIter))
-
-        (= ?a_m_stride (MIter))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_k_stride (MMul (MIter) ?m))
-
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MMul (MIter) ?k))
-        (= ?b_k_stride (MIter))
-
-        (= ?dt (dtype ?a))
-        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
-    )
-    (
-        (let ?sgemm (Op (cublaslt
-            ?m ?n ?k
-            "N" "N"
-            "COL" "COL" "ROW" "ROW"
-            ?a_k_stride
-            ?b_n_stride
-            ?n
-            ?n
-            (MNum 1)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            (MNum 0)
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b (INil)))))
-        (union ?sum ?sgemm)
-        (set (dtype ?sgemm) ?dt)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order column-major x column-major"
-)
-
-(rule
-    (
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-
-        (= ?batch (nth_from_end ?out_shape 2))
-        (= ?m (nth_from_end ?out_shape 1))
-        (= ?n (nth_from_end ?out_shape 0))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-        (!= ?batch (MNum 0))
-
-        (= ?a_batch_stride (nth_from_end ?a_stride 3))
-        (= ?a_m_stride (nth_from_end ?a_stride 2))
-        (= ?a_n_stride (nth_from_end ?a_stride 1))
-        (= ?a_k_stride (nth_from_end ?a_stride 0))
-
-        (= ?b_batch_stride (nth_from_end ?b_stride 3))
-        (= ?b_m_stride (nth_from_end ?b_stride 2))
-        (= ?b_n_stride (nth_from_end ?b_stride 1))
-        (= ?b_k_stride (nth_from_end ?b_stride 0))
-
-        (= ?k_stride (MIter))
-
-        (= ?a_k_stride (MIter))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_m_stride (MMul (MIter) ?k))
-
-        (= ?b_n_stride (MIter))
-        (= ?b_m_stride (MNum 0))
-        (= ?b_k_stride (MMul (MIter) ?n))
-
-        (= ?a_batch_stride (MMul ?m ?a_m_stride))
-        (= ?b_batch_stride (MMul ?k ?b_k_stride))
-
-        (= ?dt (dtype ?a))
-        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
-    )
-    (
-        (let ?sgemm (Op (cublaslt
-            ?m ?n ?k
-            "N" "N"
-            "ROW" "ROW" "ROW" "ROW"
-            ?a_m_stride
-            ?b_k_stride
-            ?n
-            ?n
-            ?batch
-            ?a_batch_stride
-            ?b_batch_stride
-            (MMul ?m ?n)
-            (MMul ?m ?n)
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b (INil)))))
-        (union ?sum ?sgemm)
-        (set (dtype ?sgemm) ?dt)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order batched row-major x row-major"
-)
-
-(rule
-    (
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-
-        (= ?batch (nth_from_end ?out_shape 2))
-        (= ?m (nth_from_end ?out_shape 1))
-        (= ?n (nth_from_end ?out_shape 0))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-        (!= ?batch (MNum 0))
-
-        (= ?a_batch_stride (nth_from_end ?a_stride 3))
-        (= ?a_m_stride (nth_from_end ?a_stride 2))
-        (= ?a_n_stride (nth_from_end ?a_stride 1))
-        (= ?a_k_stride (nth_from_end ?a_stride 0))
-
-        (= ?b_batch_stride (nth_from_end ?b_stride 3))
-        (= ?b_m_stride (nth_from_end ?b_stride 2))
-        (= ?b_n_stride (nth_from_end ?b_stride 1))
-        (= ?b_k_stride (nth_from_end ?b_stride 0))
-
-        (= ?k_stride (MIter))
-
-        (= ?a_k_stride (MIter))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_m_stride (MMul (MIter) ?k))
-
-        (= ?b_k_stride (MIter))
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MMul (MIter) ?k))
-
-        (= ?a_batch_stride (MMul ?m ?a_m_stride))
-        (= ?b_batch_stride (MMul ?n ?b_n_stride))
-
-        (= ?dt (dtype ?a))
-        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
-    )
-    (
-        (let ?sgemm (Op (cublaslt
-            ?m ?n ?k
-            "N" "N"
-            "ROW" "COL" "ROW" "ROW"
-            ?a_m_stride
-            ?b_n_stride
-            ?n
-            ?n
-            ?batch
-            ?a_batch_stride
-            ?b_batch_stride
-            (MMul ?m ?n)
-            (MMul ?m ?n)
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b (INil)))))
-        (union ?sum ?sgemm)
-        (set (dtype ?sgemm) ?dt)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order batched row-major x column-major"
-)
-
-(rule
-    (
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-
-        (= ?batch (nth_from_end ?out_shape 2))
-        (= ?m (nth_from_end ?out_shape 1))
-        (= ?n (nth_from_end ?out_shape 0))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-        (!= ?batch (MNum 0))
-
-        (= ?a_batch_stride (nth_from_end ?a_stride 3))
-        (= ?a_m_stride (nth_from_end ?a_stride 2))
-        (= ?a_n_stride (nth_from_end ?a_stride 1))
-        (= ?a_k_stride (nth_from_end ?a_stride 0))
-
-        (= ?b_batch_stride (nth_from_end ?b_stride 3))
-        (= ?b_m_stride (nth_from_end ?b_stride 2))
-        (= ?b_n_stride (nth_from_end ?b_stride 1))
-        (= ?b_k_stride (nth_from_end ?b_stride 0))
-
-        (= ?k_stride (MIter))
-
-        (= ?a_m_stride (MIter))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_k_stride (MMul (MIter) ?m))
-
-        (= ?b_n_stride (MIter))
-        (= ?b_m_stride (MNum 0))
-        (= ?b_k_stride (MMul (MIter) ?n))
-
-        (= ?a_batch_stride (MMul ?k ?a_k_stride))
-        (= ?b_batch_stride (MMul ?k ?b_k_stride))
-
-        (= ?dt (dtype ?a))
-        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
-    )
-    (
-        (let ?sgemm (Op (cublaslt
-            ?m ?n ?k
-            "N" "N"
-            "COL" "ROW" "ROW" "ROW"
-            ?a_k_stride
-            ?b_k_stride
-            ?n
-            ?n
-            ?batch
-            ?a_batch_stride
-            ?b_batch_stride
-            (MMul ?m ?n)
-            (MMul ?m ?n)
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b (INil)))))
-        (union ?sum ?sgemm)
-        (set (dtype ?sgemm) ?dt)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order batched column-major x row-major"
-)
-
-(rule
-    (
-        (= ?mul (Op (Mul ?mul_shape ?a_stride ?b_stride ?mul_out_stride) (ICons ?a (ICons ?b (INil)))))
-        (= ?sum (Op (Sum ?out_shape ?k ?sum_in_stride ?k_stride ?sum_out_stride) (ICons ?mul (INil))))
-
-        (= ?batch (nth_from_end ?out_shape 2))
-        (= ?m (nth_from_end ?out_shape 1))
-        (= ?n (nth_from_end ?out_shape 0))
-        (!= ?m (MNum 0))
-        (!= ?n (MNum 0))
-        (!= ?k (MNum 1))
-        (!= ?batch (MNum 0))
-
-        (= ?a_batch_stride (nth_from_end ?a_stride 3))
-        (= ?a_m_stride (nth_from_end ?a_stride 2))
-        (= ?a_n_stride (nth_from_end ?a_stride 1))
-        (= ?a_k_stride (nth_from_end ?a_stride 0))
-
-        (= ?b_batch_stride (nth_from_end ?b_stride 3))
-        (= ?b_m_stride (nth_from_end ?b_stride 2))
-        (= ?b_n_stride (nth_from_end ?b_stride 1))
-        (= ?b_k_stride (nth_from_end ?b_stride 0))
-
-        (= ?k_stride (MIter))
-
-        (= ?a_m_stride (MIter))
-        (= ?a_n_stride (MNum 0))
-        (= ?a_k_stride (MMul (MIter) ?m))
-
-        (= ?b_k_stride (MIter))
-        (= ?b_m_stride (MNum 0))
-        (= ?b_n_stride (MMul (MIter) ?k))
-
-        (= ?a_batch_stride (MMul ?k ?a_k_stride))
-        (= ?b_batch_stride (MMul ?n ?b_n_stride))
-
-        (= ?dt (dtype ?a))
-        (= ?dt (dtype ?b))
-        (cublaslt_base_dtype ?dt)
-    )
-    (
-        (let ?sgemm (Op (cublaslt
-            ?m ?n ?k
-            "N" "N"
-            "COL" "COL" "ROW" "ROW"
-            ?a_k_stride
-            ?b_n_stride
-            ?n
-            ?n
-            ?batch
-            ?a_batch_stride
-            ?b_batch_stride
-            (MMul ?m ?n)
-            (MMul ?m ?n)
-            ?dt ?dt ?dt ?dt "default" "default" 1.0 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b (INil)))))
-        (union ?sum ?sgemm)
-        (set (dtype ?sgemm) ?dt)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order batched column-major x column-major"
-)
--- a/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_scale_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/cublaslt/cublaslt_scale_rewrite.egg
@@ -1,316 +0,0 @@
-; Scalar alpha/beta rewrites for cuBLASLt. These rules target scalar constants
-; expanded across the matmul/add shape, i.e. zero strides on every logical axis.
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            1.0 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-
-        (= ?scale (Op (Constant ?alpha) (INil)))
-        ; alpha=1.0 hash-conses ?fused == ?matmul; the union merges Mul into ?matmul's eclass and saturate diverges. 
-        (!= ?alpha 1.0)
-        (= ?scaled (Op (Mul ?shape
-            ?matmul_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?scaled_out_strides)
-            (ICons ?matmul (ICons ?scale (INil)))))
-        (= ?matmul_strides ?scaled_out_strides)
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (union ?scaled ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt 2d alpha scale"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            1.0 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-
-        (= ?scale (Op (Constant ?alpha) (INil)))
-        ; See 2d alpha scale: alpha=1.0 makes (saturate ...) diverge.   
-        (!= ?alpha 1.0)
-        (= ?scaled (Op (Mul ?shape
-            ?matmul_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?scaled_out_strides)
-            (ICons ?matmul (ICons ?scale (INil)))))
-        (= ?matmul_strides ?scaled_out_strides)
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?c_order ?d_order
-            ?lda ?ldb ?ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 "DEFAULT")
-            (ICons ?a (ICons ?b ?matmul_tail))))
-        (union ?scaled ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt batched alpha scale"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?matmul_c_order "ROW"
-            ?lda ?ldb ?matmul_ldc ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b ?matmul_stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 ?epilogue)
-            (ICons ?a (ICons ?b ?matmul_tail))))
-
-        (= ?beta_node (Op (Constant ?beta) (INil)))
-        (= ?scaled_c (Op (Mul
-            (ECons ?m (ECons ?n (ENil)))
-            ?c_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?scaled_c_out_strides)
-            (ICons ?c (ICons ?beta_node (INil)))))
-
-        (= ?add (Op (Add
-            (ECons ?m (ECons ?n (ENil)))
-            ?matmul_add_strides
-            ?scaled_c_add_strides
-            ?add_out_strides)
-            (ICons ?matmul (ICons ?scaled_c (INil)))))
-
-        (= ?matmul_add_strides (ECons ?d_row_stride (ECons ?d_col_stride (ENil))))
-        (= ?c_strides (ECons ?c_row_stride (ECons ?c_col_stride (ENil))))
-        (= ?add_out_strides (ECons ?d_row_stride (ECons ?d_col_stride (ENil))))
-        (= ?scaled_c_add_strides ?scaled_c_out_strides)
-        (= ?c_col_stride (MIter))
-        (!= ?c_row_stride (MNum 0))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?c_dtype (dtype ?c))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order "ROW" "ROW"
-            ?lda ?ldb ?c_row_stride ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b (MNum 0) ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha ?beta ?epilogue)
-            (ICons ?a (ICons ?b (ICons ?c ?matmul_tail)))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order 2d scaled c beta"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?matmul_c_order "ROW"
-            ?lda ?ldb ?matmul_ldc ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b ?matmul_stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 ?epilogue)
-            (ICons ?a (ICons ?b ?matmul_tail))))
-
-        (= ?beta_node (Op (Constant ?beta) (INil)))
-        (= ?scaled_c (Op (Mul
-            (ECons ?m (ECons ?n (ENil)))
-            ?c_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ENil)))
-            ?scaled_c_out_strides)
-            (ICons ?c (ICons ?beta_node (INil)))))
-
-        (= ?add (Op (Add
-            (ECons ?m (ECons ?n (ENil)))
-            ?scaled_c_add_strides
-            ?matmul_add_strides
-            ?add_out_strides)
-            (ICons ?scaled_c (ICons ?matmul (INil)))))
-
-        (= ?matmul_add_strides (ECons ?d_row_stride (ECons ?d_col_stride (ENil))))
-        (= ?c_strides (ECons ?c_row_stride (ECons ?c_col_stride (ENil))))
-        (= ?add_out_strides (ECons ?d_row_stride (ECons ?d_col_stride (ENil))))
-        (= ?scaled_c_add_strides ?scaled_c_out_strides)
-        (= ?c_col_stride (MIter))
-        (!= ?c_row_stride (MNum 0))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?c_dtype (dtype ?c))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order "ROW" "ROW"
-            ?lda ?ldb ?c_row_stride ?ldd
-            (MNum 1)
-            ?stride_a ?stride_b (MNum 0) ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha ?beta ?epilogue)
-            (ICons ?a (ICons ?b (ICons ?c ?matmul_tail)))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order 2d scaled c plus matmul beta"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?matmul_c_order "ROW"
-            ?lda ?ldb ?matmul_ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?matmul_stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 ?epilogue)
-            (ICons ?a (ICons ?b ?matmul_tail))))
-
-        (= ?beta_node (Op (Constant ?beta) (INil)))
-        (= ?scaled_c (Op (Mul
-            (ECons ?batch (ECons ?m (ECons ?n (ENil))))
-            ?c_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?scaled_c_out_strides)
-            (ICons ?c (ICons ?beta_node (INil)))))
-
-        (= ?add (Op (Add
-            (ECons ?batch (ECons ?m (ECons ?n (ENil))))
-            ?matmul_add_strides
-            ?scaled_c_add_strides
-            ?add_out_strides)
-            (ICons ?matmul (ICons ?scaled_c (INil)))))
-
-        (= ?matmul_add_strides (ECons ?d_batch_stride (ECons ?d_row_stride (ECons ?d_col_stride (ENil)))))
-        (= ?c_strides (ECons ?c_batch_stride (ECons ?c_row_stride (ECons ?c_col_stride (ENil)))))
-        (= ?add_out_strides (ECons ?d_batch_stride (ECons ?d_row_stride (ECons ?d_col_stride (ENil)))))
-        (= ?scaled_c_add_strides ?scaled_c_out_strides)
-        (= ?c_col_stride (MIter))
-        (!= ?c_row_stride (MNum 0))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?c_dtype (dtype ?c))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order "ROW" "ROW"
-            ?lda ?ldb ?c_row_stride ?ldd
-            ?batch
-            ?stride_a ?stride_b ?c_batch_stride ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha ?beta ?epilogue)
-            (ICons ?a (ICons ?b (ICons ?c ?matmul_tail)))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order batched scaled c beta"
-)
-
-(rule
-    (
-        (= ?matmul (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order ?matmul_c_order "ROW"
-            ?lda ?ldb ?matmul_ldc ?ldd
-            ?batch
-            ?stride_a ?stride_b ?matmul_stride_c ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha 0.0 ?epilogue)
-            (ICons ?a (ICons ?b ?matmul_tail))))
-
-        (= ?beta_node (Op (Constant ?beta) (INil)))
-        (= ?scaled_c (Op (Mul
-            (ECons ?batch (ECons ?m (ECons ?n (ENil))))
-            ?c_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?scaled_c_out_strides)
-            (ICons ?c (ICons ?beta_node (INil)))))
-
-        (= ?add (Op (Add
-            (ECons ?batch (ECons ?m (ECons ?n (ENil))))
-            ?scaled_c_add_strides
-            ?matmul_add_strides
-            ?add_out_strides)
-            (ICons ?scaled_c (ICons ?matmul (INil)))))
-
-        (= ?matmul_add_strides (ECons ?d_batch_stride (ECons ?d_row_stride (ECons ?d_col_stride (ENil)))))
-        (= ?c_strides (ECons ?c_batch_stride (ECons ?c_row_stride (ECons ?c_col_stride (ENil)))))
-        (= ?add_out_strides (ECons ?d_batch_stride (ECons ?d_row_stride (ECons ?d_col_stride (ENil)))))
-        (= ?scaled_c_add_strides ?scaled_c_out_strides)
-        (= ?c_col_stride (MIter))
-        (!= ?c_row_stride (MNum 0))
-        (= ?matmul_add_strides ?add_out_strides)
-        (= ?c_dtype (dtype ?c))
-    )
-    (
-        (let ?fused (Op (cublaslt
-            ?m ?n ?k
-            ?a_layout ?b_layout
-            ?a_order ?b_order "ROW" "ROW"
-            ?lda ?ldb ?c_row_stride ?ldd
-            ?batch
-            ?stride_a ?stride_b ?c_batch_stride ?stride_d
-            ?a_dtype ?b_dtype ?c_dtype ?d_dtype
-            ?compute_type ?scale_dtype
-            ?alpha ?beta ?epilogue)
-            (ICons ?a (ICons ?b (ICons ?c ?matmul_tail)))))
-        (union ?add ?fused)
-        (set (dtype ?fused) ?d_dtype)
-    )
-    :ruleset matmul_backend
-    :name "cublaslt row-order batched scaled c plus matmul beta"
-)
--- a/crates/luminal_cuda_lite/src/host/cublaslt/mod.rs
+++ b/crates/luminal_cuda_lite/src/host/cublaslt/mod.rs
--- a/crates/luminal_cuda_lite/src/host/flashinfer/README.md
+++ b/crates/luminal_cuda_lite/src/host/flashinfer/README.md
@@ -1,124 +0,0 @@
-# FlashInfer Integration
-
-FlashInfer replaces the multi-op attention pattern (Q×K^T → scale → mask → softmax → ×V) with a single fused GPU kernel via [FlashInfer](https://github.com/flashinfer-ai/flashinfer)'s batch decode and batch prefill APIs.
-
-## Current State
-
-**Working:**
- Egglog rewrite rule matches any GQA paged attention pattern (model-agnostic shapes)
- GA search selects FlashInfer when it wins profiling — verified on Llama 3 8B (32 layers) and Qwen 3 4B (36 layers)
- **BatchDecode** (s=1): fp32 natively — FlashInfer's decode kernel uses scalar vectorized dot products, no tensor cores
- **BatchPrefill**: template-instantiated for fp16 but **not callable from fp32** — FlashInfer's prefill kernel requires tensor core MMA (`mma.sync.aligned.m16n8k16`) and `ldmatrix` which physically only operate on 16-bit types; the C API stubs return -1 for fp32; will be enabled when native fp16/bf16 pipeline is added
- Decode handles all cases in the current fp32 pipeline (prefill uses cuBLAS attention via dim bucketing)
- Indptr-based mask: `qo_indptr` and `kv_indptr` are computed in-graph so the egglog rule can see them in the same chunk as the attention ops
-
-**Not yet implemented:**
- Native fp16 / bf16 pipeline (would eliminate the cast overhead in prefill)
- Page sizes > 1
-
---
-
-## File Organization
-
-```
-src/host/flashinfer/
-  flashinfer_attention.egg  — egglog rewrite rule (pattern match → FlashInferAttention)
-  mod.rs                    — FlashInferAttention op (EgglogOp + HostOp impl)
-  jit.rs                    — JIT compilation: nvcc wrapper.cu → .so, dlopen, fn pointers
-  find_indptrs.rs           — walks the mask e-graph node to locate qo_indptr / kv_indptr inputs
-  wrapper.cu                — CUDA: FlashInfer template instantiation + helper kernels
-  wrapper.h                 — C API header for wrapper.cu
-  README.md                 — this file
-```
-
-## How It Works
-
-### 1. Egglog Pattern Matching
-
-The rule in `flashinfer_attention.egg` matches the structural pattern of paged GQA attention:
-
-```
-Gather(K_cache, idx) → GQA broadcast (Mul×1.0) → Q×K^T → Sum → scale → mask Add → softmax → attn×V → Sum → output
-Gather(V_cache, idx) → GQA broadcast (Mul×1.0) ──────────────────────────────────────────→ attn×V → Sum → output
-```
-
-Key anchors that prevent false matches on MLP or other ops:
- Two Gather ops from 2D cache pools (MLP never uses Gather)
- GQA broadcast via `Mul(gathered, Constant(1.0))` with all-zero strides
- Mask Add with zero-stride broadcast in the first (nheads) dimension
- Two sequential matmul+Sum pairs connected through softmax
-
-Shape dimensions are egglog variables, not pinned constants — the rule works for any model with GQA (Llama, Qwen, Mistral, etc.). The structural invariants (dimension count, zero-stride positions, Gather from 2D) are enough to avoid combinatorial explosion during saturation.
-
-When the rule fires, it unions `FlashInferAttention` with the original attention output, making it an equivalent alternative in the e-graph. The GA search then profiles both paths and picks the faster one.
-
-### 2. Extraction: Finding Indptrs
-
-During `extract()` (called when egglog selects the FlashInferAttention e-node), `find_indptrs.rs` walks backward from the mask node in the e-graph to locate the `qo_indptr` and `kv_indptr` Input nodes. It validates the mask structure by checking for the `Mul(allowed, Constant(1e10))` pattern that `compute_attn_mask()` produces.
-
-The indptrs are appended as inputs 5 and 6 to the FlashInferAttention op, so the runtime can build the CSR page table directly without recomputing anything.
-
-### 3. JIT Compilation
-
-FlashInfer requires `HEAD_DIM` as a compile-time template parameter. Rather than baking it at `cargo build` time, `jit.rs` JIT-compiles `wrapper.cu` with the model's actual HEAD_DIM:
-
-1. First call to `ensure_compiled(head_dim)` runs `nvcc` with `-DLUMINAL_HEAD_DIM=<N>`
-2. The compiled `.so` is cached at `~/.cache/luminal/flashinfer/libflashinfer_hd<N>_<arch>.so`
-3. Subsequent calls load the cached library via `dlopen`
-4. Function pointers (plan, run, transpose, etc.) are resolved and stored in a `static OnceLock`
-
-Supported HEAD_DIM values: 64, 128, 256.
-
-### 4. Runtime Execution
-
-`FlashInferAttention::execute()` dispatches to decode or prefill based on `total_q_tokens vs batch_size`:
-
-**Common steps:**
-1. **Extract kv_indices** — a helper kernel converts the flat gather index `(c, KV_DIM)` to slot indices `(c,)`
-2. **Read indptrs to host** — copied to CPU for the plan phase
-3. **Plan** — queries GPU occupancy and decides split-KV decomposition
-4. **Run** — the fused kernel writes `(total_q_tokens, num_qo_heads, head_dim)`
-5. **Transpose** — transposes to `(num_qo_heads, total_q_tokens, head_dim)` to match the Sum reduction layout
-
-**Decode path** (current, fp32): Always used. Runs FlashInfer's BatchDecode directly on fp32 buffers.
-
-**Prefill path** (future, fp16/bf16 only): The prefill kernel templates are compiled into the JIT .so for fp16 (CTA_TILE_Q=16/64/128, causal mask). The C API stubs currently return -1 since the pipeline is fp32. When native fp16/bf16 dtype support is added, `execute()` will dispatch to prefill when `total_q_tokens > batch_size`.
-
-Global workspaces (`static OnceLock`) are shared across all FlashInferAttention instances to avoid ~4ms allocation overhead per GA profiling candidate. Without this, the GA never selects FlashInfer because the first-run allocation cost dwarfs the kernel time.
-
-## How the Attention Mask Enables FlashInfer
-
-For the egglog rule to fire, the `qo_indptr` and `kv_indptr` tensors must be visible in the same e-graph chunk as the attention ops. This is why the mask is computed *inside* each layer (via `compute_attn_mask()` in the model) rather than passed as a pre-computed input.
-
-The mask computation uses a specific structure:
-```rust
-let allowed = same_request * causal;
-allowed * 1e10 - 1e10    // → 0.0 for allowed, -1e10 for blocked
-```
-
-The `Mul(allowed, Constant(1e10))` pattern is the anchor that `find_indptrs.rs` uses to walk backward and locate the indptr inputs.
-
-## Roadmap
-
-Items listed in priority order. Checked items are done.
-
- [x] Model-agnostic egglog rule (shape variables instead of Llama-specific constants)
- [x] bs>1 supersequence decode
- [x] Indptr-based attention mask (replaces CPU-computed mask)
- [x] Multi-model support (verified on Llama 3 8B and Qwen 3 4B)
- [x] BatchPrefill kernel compiled for fp16 (causal mask, CTA_TILE_Q=16/64/128)
- [ ] Native fp16 / bf16 pipeline (enables prefill, reduces memory, eliminates cuBLAS prefill fallback)
- [ ] HEAD_DIM dispatch for 64, 96 (JIT supports 64/128/256; wrapper.cu needs 96 for Phi)
- [ ] Page sizes > 1 (currently page_size=1; larger pages reduce CSR overhead)
- [ ] Sliding window, ALiBi, logits soft cap (FlashInfer `AttentionVariant` templates)
- [ ] MHA / MQA / arbitrary GQA ratios beyond {1, 2, 4, 8}
-
-## Key Design Decisions
-
- **page_size=1**: Each KV cache slot is one "page". This simplifies the CSR page table (`kv_indices` = physical slot indices directly) and matches the flat `(num_slots, KV_DIM)` cache layout.
-
- **Pinned structural anchors**: The egglog rule pins the *structure* (number of dimensions, which dims are zero-stride, presence of Gather from 2D cache) but uses variables for the *values* (head counts, head_dim). This prevents saturation blowup while remaining model-agnostic.
-
- **Prefill requires fp16/bf16**: FlashInfer's prefill kernel uses tensor core MMA instructions (`mma.sync.aligned.m16n8k16`) and `ldmatrix` which physically require 16-bit inputs — there is no fp32 tensor core matmul instruction. The prefill kernel templates are compiled into the .so for fp16 but the C API returns -1 for fp32 callers. When native fp16/bf16 is added, prefill will be enabled automatically.
-
- **Global workspaces**: Float workspace (128 MiB), int workspace (8 MiB), and a page-locked host buffer are allocated once via `static OnceLock` and shared across all instances.
--- a/crates/luminal_cuda_lite/src/host/flashinfer/find_indptrs.rs
+++ b/crates/luminal_cuda_lite/src/host/flashinfer/find_indptrs.rs
@@ -1,328 +0,0 @@
-//! Walk the e-graph from the mask node to find qo_indptr and kv_indptr Input nodes.
-//!
-//! The mask is produced by `compute_attn_mask(q_pos, qo_indptr, kv_indptr)` using
-//! primitive HLIR ops. This module validates the mask's structure and extracts the
-//! indptr Input node IDs so FlashInfer can use them directly.
-
-use luminal::egglog_utils::{ClassId, NodeId, SerializedEGraph};
-use luminal::prelude::FxHashSet;
-
-/// Result of walking the mask computation chain.
-#[derive(Debug)]
-pub struct IndptrNodes<'a> {
-    pub qo_indptr: &'a NodeId,
-    pub kv_indptr: &'a NodeId,
-}
-
-/// Find the qo_indptr and kv_indptr Input nodes by walking backwards from the mask.
-///
-/// Validates the mask structure: `allowed * 1e10 + (-1e10)`. Then does a BFS from
-/// the `allowed` subtree to find all reachable Input nodes with names containing
-/// "qo_indptr" and "kv_indptr".
-///
-/// Panics with a diagnostic message if the structure doesn't match or the
-/// indptr inputs can't be found.
-pub fn find_indptr_inputs<'a>(
-    egraph: &'a SerializedEGraph,
-    mask_node: &'a NodeId,
-) -> IndptrNodes<'a> {
-    // Step 1: Validate mask = Add(scaled_allowed, neg_constant)
-    let mask_inputs = logical_binary_inputs(egraph, mask_node, "Add").unwrap_or_else(|| {
-        let (mask_label, mask_children) = &egraph.enodes[mask_node];
-        assert!(
-            mask_label == "Op",
-            "find_indptr_inputs: mask node is not an Op (label={mask_label})"
-        );
-        let mask_kind = resolve_first_node(egraph, &mask_children[0]);
-        let mask_kind_label = &egraph.enodes[mask_kind].0;
-        panic!("find_indptr_inputs: mask is not an Add (kind={mask_kind_label})");
-    });
-    assert_eq!(
-        mask_inputs.len(),
-        2,
-        "find_indptr_inputs: mask Add should have 2 inputs, got {}",
-        mask_inputs.len()
-    );
-
-    // Step 2: One of the inputs should be Mul(allowed, Constant(1e10))
-    let (scaled_allowed, allowed_node) = find_1e10_mul(egraph, &mask_inputs);
-
-    // Step 3: BFS from `allowed` to find all reachable Input nodes
-    let reachable_inputs = find_reachable_inputs(egraph, allowed_node);
-
-    // Step 4: Match by name
-    let mut qo_indptr: Option<&NodeId> = None;
-    let mut kv_indptr: Option<&NodeId> = None;
-
-    for (node_id, name) in &reachable_inputs {
-        if name.contains("qo_indptr") {
-            qo_indptr = Some(node_id);
-        } else if name.contains("kv_indptr") {
-            kv_indptr = Some(node_id);
-        }
-    }
-
-    let qo = qo_indptr.unwrap_or_else(|| {
-        let found_names: Vec<&str> = reachable_inputs.iter().map(|(_, n)| n.as_str()).collect();
-        panic!(
-            "find_indptr_inputs: could not find 'qo_indptr' Input reachable from mask.\n\
-             Found inputs: {:?}\n\
-             Mask node: {:?}\n\
-             Scaled allowed node: {:?}",
-            found_names, mask_node, scaled_allowed
-        );
-    });
-
-    let kv = kv_indptr.unwrap_or_else(|| {
-        let found_names: Vec<&str> = reachable_inputs.iter().map(|(_, n)| n.as_str()).collect();
-        panic!(
-            "find_indptr_inputs: could not find 'kv_indptr' Input reachable from mask.\n\
-             Found inputs: {:?}\n\
-             Mask node: {:?}\n\
-             Scaled allowed node: {:?}",
-            found_names, mask_node, scaled_allowed
-        );
-    });
-
-    IndptrNodes {
-        qo_indptr: qo,
-        kv_indptr: kv,
-    }
-}
-
-fn find_1e10_mul<'a>(
-    egraph: &'a SerializedEGraph,
-    mask_add_inputs: &[&'a NodeId],
-) -> (&'a NodeId, &'a NodeId) {
-    for &input_node in mask_add_inputs {
-        let Some(mul_inputs) = logical_binary_inputs(egraph, input_node, "Mul") else {
-            continue;
-        };
-        if mul_inputs.len() != 2 {
-            continue;
-        }
-        for (i, &inp) in mul_inputs.iter().enumerate() {
-            if is_constant(egraph, inp, 1e10) {
-                let other = mul_inputs[1 - i];
-                return (input_node, other);
-            }
-        }
-    }
-    let mut debug_info = String::new();
-    for (i, &input_node) in mask_add_inputs.iter().enumerate() {
-        let (label, children) = &egraph.enodes[input_node];
-        debug_info.push_str(&format!("\n  input[{i}]: label={label}"));
-        if label == "Op" && !children.is_empty() {
-            let kind = resolve_first_node(egraph, &children[0]);
-            let kind_label = &egraph.enodes[kind].0;
-            debug_info.push_str(&format!(" kind={kind_label}"));
-            for (j, kc) in egraph.enodes[kind].1.iter().enumerate() {
-                let kc_node = resolve_first_node(egraph, kc);
-                debug_info.push_str(&format!(" child[{j}]={}", egraph.enodes[kc_node].0));
-            }
-            if kind_label.contains("Mul") && children.len() >= 2 {
-                let mul_inputs = walk_ilist_simple(egraph, &children[1]);
-                for (j, &mi) in mul_inputs.iter().enumerate() {
-                    let (ml, mc) = &egraph.enodes[mi];
-                    debug_info.push_str(&format!("\n    mul_input[{j}]: label={ml}"));
-                    if ml == "Op" && !mc.is_empty() {
-                        let mk = resolve_first_node(egraph, &mc[0]);
-                        debug_info.push_str(&format!(" kind={}", egraph.enodes[mk].0));
-                        for (k, mkc) in egraph.enodes[mk].1.iter().enumerate() {
-                            let mkc_node = resolve_first_node(egraph, mkc);
-                            debug_info.push_str(&format!(" ch[{k}]={}", egraph.enodes[mkc_node].0));
-                        }
-                    }
-                }
-            }
-        }
-    }
-    panic!(
-        "find_indptr_inputs: could not find Mul(allowed, Constant(1e10)) in mask Add inputs.{debug_info}"
-    );
-}
-
-fn is_constant(egraph: &SerializedEGraph, node: &NodeId, expected: f32) -> bool {
-    let node = resolve_op_with_kind(egraph, node, "Constant").unwrap_or(node);
-    let (label, children) = &egraph.enodes[node];
-    if label != "Op" {
-        return false;
-    }
-    let kind = resolve_first_node(egraph, &children[0]);
-    let kind_label = &egraph.enodes[kind].0;
-    if !kind_label.contains("Constant") {
-        return false;
-    }
-    let val_children = &egraph.enodes[kind].1;
-    if val_children.is_empty() {
-        return false;
-    }
-    let val_node = resolve_first_node(egraph, &val_children[0]);
-    let val_str = &egraph.enodes[val_node].0;
-    if let Ok(val) = val_str.parse::<f64>() {
-        (val as f32 - expected).abs() < 1.0
-    } else {
-        false
-    }
-}
-
-fn find_reachable_inputs<'a>(
-    egraph: &'a SerializedEGraph,
-    start: &'a NodeId,
-) -> Vec<(&'a NodeId, String)> {
-    let mut found = Vec::new();
-    let mut visited = FxHashSet::default();
-    let mut stack = vec![start];
-
-    while let Some(node) = stack.pop() {
-        if !visited.insert(node) {
-            continue;
-        }
-
-        let (label, children) = &egraph.enodes[node];
-
-        if label == "Input" {
-            if children.len() >= 2 {
-                let name_node = resolve_first_node(egraph, &children[1]);
-                let name = egraph.enodes[name_node].0.trim_matches('"').to_string();
-                found.push((node, name));
-            }
-            continue;
-        }
-
-        if label == "Op" && children.len() >= 2 {
-            let ir_inputs = walk_ilist_simple(egraph, &children[1]);
-            for inp in ir_inputs {
-                stack.push(inp);
-            }
-        }
-    }
-
-    found
-}
-
-fn walk_ilist_simple<'a>(
-    egraph: &'a SerializedEGraph,
-    ilist_eclass: &'a ClassId,
-) -> Vec<&'a NodeId> {
-    let mut inputs = Vec::new();
-    let mut current = resolve_first_node(egraph, ilist_eclass);
-
-    loop {
-        let (label, children) = &egraph.enodes[current];
-        if label == "INil" {
-            break;
-        }
-        if label != "ICons" {
-            break;
-        }
-        let ir_node = resolve_first_ir_node(egraph, &children[0]);
-        inputs.push(ir_node);
-        current = resolve_first_node(egraph, &children[1]);
-    }
-
-    inputs
-}
-
-fn resolve_first_node<'a>(egraph: &'a SerializedEGraph, eclass: &ClassId) -> &'a NodeId {
-    &egraph.eclasses[eclass].1[0]
-}
-
-fn resolve_first_ir_node<'a>(egraph: &'a SerializedEGraph, eclass: &ClassId) -> &'a NodeId {
-    let nodes = &egraph.eclasses[eclass].1;
-    for node in nodes {
-        let label = &egraph.enodes[node].0;
-        if label == "Op" || label == "Input" {
-            return node;
-        }
-    }
-    &nodes[0]
-}
-
-fn resolve_op_with_kind<'a>(
-    egraph: &'a SerializedEGraph,
-    node: &'a NodeId,
-    kind_substr: &str,
-) -> Option<&'a NodeId> {
-    let class = egraph.node_to_class.get(node)?;
-    for candidate in &egraph.eclasses[class].1 {
-        let (label, children) = &egraph.enodes[candidate];
-        if label != "Op" || children.is_empty() {
-            continue;
-        }
-        let kind = resolve_first_node(egraph, &children[0]);
-        if egraph.enodes[kind].0.contains(kind_substr) {
-            return Some(candidate);
-        }
-    }
-    None
-}
-
-fn logical_binary_inputs<'a>(
-    egraph: &'a SerializedEGraph,
-    node: &'a NodeId,
-    op_name: &str,
-) -> Option<Vec<&'a NodeId>> {
-    if let Some(op_node) = resolve_op_with_kind(egraph, node, op_name) {
-        let (_, children) = &egraph.enodes[op_node];
-        return Some(walk_ilist_simple(egraph, &children[1]));
-    }
-
-    let (label, children) = &egraph.enodes[node];
-    if label != "Op" || children.len() < 2 {
-        return None;
-    }
-    let kind = resolve_first_node(egraph, &children[0]);
-    if egraph.enodes[kind].0.contains("CudaBinaryElementwise") {
-        let opcode_class = egraph.enodes[kind].1.first()?;
-        let opcode_node = resolve_first_node(egraph, opcode_class);
-        if egraph.enodes[opcode_node].0.trim_matches('"') != op_name {
-            return None;
-        }
-        return Some(
-            walk_ilist_simple(egraph, &children[1])
-                .into_iter()
-                .map(|input| unwrap_fusion_start(egraph, input))
-                .collect(),
-        );
-    }
-    if !egraph.enodes[kind].0.contains("FusionEnd") {
-        return None;
-    }
-    let fe_inputs = walk_ilist_simple(egraph, &children[1]);
-    let elem = *fe_inputs.first()?;
-    let (elem_label, elem_children) = &egraph.enodes[elem];
-    if elem_label != "Op" || elem_children.len() < 2 {
-        return None;
-    }
-    let elem_kind = resolve_first_node(egraph, &elem_children[0]);
-    if !egraph.enodes[elem_kind].0.contains("CudaBinaryElementwise") {
-        return None;
-    }
-    let opcode_class = egraph.enodes[elem_kind].1.first()?;
-    let opcode_node = resolve_first_node(egraph, opcode_class);
-    if egraph.enodes[opcode_node].0.trim_matches('"') != op_name {
-        return None;
-    }
-    Some(
-        walk_ilist_simple(egraph, &elem_children[1])
-            .into_iter()
-            .map(|input| unwrap_fusion_start(egraph, input))
-            .collect(),
-    )
-}
-
-fn unwrap_fusion_start<'a>(egraph: &'a SerializedEGraph, node: &'a NodeId) -> &'a NodeId {
-    let (label, children) = &egraph.enodes[node];
-    if label != "Op" || children.len() < 2 {
-        return node;
-    }
-    let kind = resolve_first_node(egraph, &children[0]);
-    if !egraph.enodes[kind].0.contains("FusionStart") {
-        return node;
-    }
-    walk_ilist_simple(egraph, &children[1])
-        .first()
-        .copied()
-        .unwrap_or(node)
-}
--- a/crates/luminal_cuda_lite/src/host/flashinfer/flashinfer_attention.egg
+++ b/crates/luminal_cuda_lite/src/host/flashinfer/flashinfer_attention.egg
@@ -1,135 +0,0 @@
-; FlashInfer batch decode attention rewrite rule.
-;
-; Matches the paged attention pattern for ANY model with GQA:
-;   Gather(K_cache) → GQA broadcast → Q*K^T matmul → scale → add mask → softmax → attn*V matmul
-;   Gather(V_cache) → GQA broadcast ──────────────────────────────────────────→ attn*V matmul
-;
-; Structural anchors (prevent false matches on MLP/other ops):
-;   - Gather ops from 2D cache pools (MLP never uses Gather)
-;   - GQA broadcast via Mul(gathered, Constant(1.0)) with all-zero strides
-;   - Scale Mul(QK, constant) connecting QK scores to mask Add
-;   - Mask Add with zero-stride broadcast in first dim (nheads broadcast)
-;   - Data flow: two sequential matmul+reduce pairs connected through softmax
-;
-; The egglog rule captures the mask as 5th input.  During extract(), a Rust
-; function walks the mask's computation chain in the e-graph to locate the
-; qo_indptr and kv_indptr Input nodes (validated via the Constant(1e10) anchor
-; and structural checks).  These are appended as inputs 5 and 6 so FlashInfer
-; can build the CSR page table directly — no runtime derivation needed.
-;
-; Shape dimensions are egglog variables, not pinned constants.
-; Dynamic dims "s" (batch/seq) and "c" (context) stay pinned as MVar.
-
-(rule
-    (
-        ; ── Second matmul: Mul(softmax_out, V_gqa) ──
-        ; Shape: (nheads, s, hdim, c) — 4D
-        (= ?mul2 (Op (Mul
-            (ECons ?nheads (ECons (MVar "s") (ECons ?hdim (ECons (MVar "c") (ENil)))))
-            ?mul2_a_strides
-            ?mul2_b_strides
-            ?mul2_out_strides)
-            (ICons ?soft (ICons ?v_gqa (INil)))))
-
-        ; ── Second matmul: Sum (reduction over c) → output ──
-        ; Shape: (nheads, s, hdim) — reduces c
-        (= ?output (Op (Sum
-            (ECons ?nheads2 (ECons (MVar "s") (ECons ?hdim2 (ENil))))
-            (MVar "c")
-            ?out_in_strides
-            (MIter)
-            ?out_out_strides)
-            (ICons ?mul2 (INil))))
-
-        ; ── V GQA broadcast: Mul(V_gathered, 1.0) with zero-stride constant ──
-        ; Shape: (nheads, c, hdim) — 3D
-        (= ?v_gqa_const (Op (Constant 1.000000) (INil)))
-        (= ?v_gqa (Op (Mul
-            (ECons ?nheads3 (ECons (MVar "c") (ECons ?hdim3 (ENil))))
-            ?v_gqa_a_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?v_gqa_out_strides)
-            (ICons ?v_gathered (ICons ?v_gqa_const (INil)))))
-
-        ; ── V Gather: rows from V_cache (2D) ──
-        ; Shape: (c, kvdim), Source: (num_slots, kvdim)
-        (= ?v_gathered (Op (Gather
-            (ECons (MVar "c") (ECons ?kvdim (ENil)))
-            ?v_gather_strides
-            (ECons ?num_slots_v (ECons ?kvdim2 (ENil)))
-            ?v_src_strides)
-            (ICons ?v_idx (ICons ?v_cache (INil)))))
-
-        ; ── First matmul: Mul(Q, K_gqa) ──
-        ; Shape: (nheads, s, c, hdim) — 4D
-        (= ?mul1 (Op (Mul
-            (ECons ?nheads4 (ECons (MVar "s") (ECons (MVar "c") (ECons ?hdim4 (ENil)))))
-            ?mul1_a_strides
-            ?mul1_b_strides
-            ?mul1_out_strides)
-            (ICons ?q (ICons ?k_gqa (INil)))))
-
-        ; ── First matmul: Sum (reduction over hdim) → QK scores ──
-        ; Shape: (nheads, s, c) — reduces hdim
-        (= ?qk (Op (Sum
-            (ECons ?nheads5 (ECons (MVar "s") (ECons (MVar "c") (ENil))))
-            ?hdim5
-            ?qk_in_strides
-            (MIter)
-            ?qk_out_strides)
-            (ICons ?mul1 (INil))))
-
-        ; ── Mask Add: Add(scaled_QK, mask) ──
-        ; Shape: (nheads, s, c) — 3D
-        ; Mask is broadcast from (s, c) via zero-stride in first dim (nheads).
-        (= ?masked (Op (Add
-            (ECons ?nheads8 (ECons (MVar "s") (ECons (MVar "c") (ENil))))
-            ?mask_add_a_strides
-            (ECons (MNum 0) ?mask_rest_strides)
-            ?mask_add_out_strides)
-            (ICons ?scaled_qk (ICons ?mask (INil)))))
-
-        ; FlashInfer needs qo_indptr/kv_indptr to be recoverable from the mask
-        ; expression. Do not match examples that pass a precomputed mask Input.
-        (= ?mask (Op (Add ?inner_mask_shape ?inner_mask_a_strides ?inner_mask_b_strides ?inner_mask_out_strides)
-            (ICons ?mask_scaled_allowed (ICons ?mask_offset (INil)))))
-        (= ?mask_scaled_allowed (Op (Mul ?allowed_shape ?allowed_strides ?scale_const_strides ?scaled_allowed_strides)
-            (ICons ?mask_allowed (ICons ?mask_scale_const (INil)))))
-        (= ?mask_scale_const (Op (Constant ?mask_scale_val) (INil)))
-        (> ?mask_scale_val 9999999999.0)
-        (< ?mask_scale_val 10000000001.0)
-
-        ; ── K GQA broadcast: Mul(K_gathered, 1.0) with zero-stride constant ──
-        ; Shape: (nheads, hdim, c) — 3D
-        (= ?k_gqa_const (Op (Constant 1.000000) (INil)))
-        (= ?k_gqa (Op (Mul
-            (ECons ?nheads6 (ECons ?hdim6 (ECons (MVar "c") (ENil))))
-            ?k_gqa_a_strides
-            (ECons (MNum 0) (ECons (MNum 0) (ECons (MNum 0) (ENil))))
-            ?k_gqa_out_strides)
-            (ICons ?k_gathered (ICons ?k_gqa_const (INil)))))
-
-        ; ── K Gather: rows from K_cache (2D) ──
-        ; Shape: (c, kvdim), Source: (num_slots, kvdim)
-        (= ?k_gathered (Op (Gather
-            (ECons (MVar "c") (ECons ?kvdim3 (ENil)))
-            ?k_gather_strides
-            (ECons ?num_slots_k (ECons ?kvdim4 (ENil)))
-            ?k_src_strides)
-            (ICons ?k_idx (ICons ?k_cache (INil)))))
-
-        ; ── Dtype consistency ──
-        (= ?dt (dtype ?q))
-        (= ?dt (dtype ?k_cache))
-        (= ?dt (dtype ?v_cache))
-    )
-    (
-        (let ?fi (Op (FlashInferAttention
-            ?nheads (MDiv ?kvdim ?hdim) ?hdim (MNum 1) (MVar "s"))
-            (ICons ?q (ICons ?k_cache (ICons ?v_cache (ICons ?k_idx (ICons ?mask (INil))))))))
-        (union ?output ?fi)
-        (set (dtype ?fi) ?dt)
-    )
-    :ruleset matmul_backend
-    :name "FlashInfer batch decode attention"
-)
--- a/crates/luminal_cuda_lite/src/host/flashinfer/jit.rs
+++ b/crates/luminal_cuda_lite/src/host/flashinfer/jit.rs
@@ -1,504 +0,0 @@
-//! JIT compilation and dynamic loading of FlashInfer kernels.
-//!
-//! Everything runs at compile / profiling time — there is no `build.rs`.
-//! `wrapper.cu` and `wrapper.h` are embedded via `include_str!()` and
-//! extracted to the cache directory on first use. The FlashInfer + CUTLASS
-//! header trees are located by probing `LUMINAL_FLASHINFER_DIR`, a small set
-//! of default paths, and (as a last resort) by `git clone`-ing FlashInfer at
-//! a pinned commit into the cache. `nvcc` is then invoked with the model's
-//! actual `HEAD_DIM` and the resulting `.so` is `dlopen`'d.
-//!
-//! `ensure_compiled` is called from `FlashInferAttention::extract()`, i.e.
-//! during luminal's compile / GA-profiling phase, not from `execute()`. After
-//! the first call the `OnceLock` makes subsequent lookups free.
-
-use std::{
-    ffi::c_void,
-    hash::{Hash, Hasher},
-    path::{Path, PathBuf},
-    process::Command,
-    sync::OnceLock,
-};
-
-// ── Function pointer types matching wrapper.h ──
-
-pub type PlanFn = unsafe extern "C" fn(
-    float_workspace: *mut c_void,
-    float_ws_size: usize,
-    int_workspace: *mut c_void,
-    int_ws_size: usize,
-    page_locked_int_workspace: *mut c_void,
-    indptr_h: *mut i32,
-    batch_size: i32,
-    num_qo_heads: i32,
-    num_kv_heads: i32,
-    page_size: i32,
-    head_dim: i32,
-    stream: *mut c_void,
-    plan_info_out: *mut i64,
-    plan_info_len_out: *mut i32,
-) -> i32;
-
-pub type RunFn = unsafe extern "C" fn(
-    float_workspace: *mut c_void,
-    float_ws_size: usize,
-    int_workspace: *mut c_void,
-    plan_info_vec: *mut i64,
-    plan_info_len: i32,
-    q: *mut f32,
-    k_cache: *mut f32,
-    v_cache: *mut f32,
-    kv_indptr: *mut i32,
-    kv_indices: *mut i32,
-    kv_last_page_len: *mut i32,
-    output: *mut f32,
-    batch_size: i32,
-    num_qo_heads: i32,
-    num_kv_heads: i32,
-    page_size: i32,
-    head_dim: i32,
-    stream: *mut c_void,
-) -> i32;
-
-pub type ExtractFn = unsafe extern "C" fn(
-    flat_idx: *const i32,
-    out: *mut i32,
-    c: i32,
-    kv_dim: i32,
-    stream: *mut c_void,
-);
-
-pub type DeriveIndptrFn =
-    unsafe extern "C" fn(mask: *const f32, indptr: *mut i32, s: i32, c: i32, stream: *mut c_void);
-
-pub type TransposeOutputFn = unsafe extern "C" fn(
-    src: *const f32,
-    dst: *mut f32,
-    batch: i32,
-    heads: i32,
-    dim: i32,
-    stream: *mut c_void,
-);
-
-pub type PrefillPlanFn = unsafe extern "C" fn(
-    float_workspace: *mut c_void,
-    float_ws_size: usize,
-    int_workspace: *mut c_void,
-    int_ws_size: usize,
-    page_locked_int_workspace: *mut c_void,
-    qo_indptr_h: *mut i32,
-    kv_indptr_h: *mut i32,
-    total_num_rows: i32,
-    batch_size: i32,
-    num_qo_heads: i32,
-    num_kv_heads: i32,
-    page_size: i32,
-    head_dim: i32,
-    stream: *mut c_void,
-    plan_info_out: *mut i64,
-    plan_info_len_out: *mut i32,
-) -> i32;
-
-pub type PrefillRunFn = unsafe extern "C" fn(
-    float_workspace: *mut c_void,
-    float_ws_size: usize,
-    int_workspace: *mut c_void,
-    plan_info_vec: *mut i64,
-    plan_info_len: i32,
-    q: *mut f32,
-    k_cache: *mut f32,
-    v_cache: *mut f32,
-    qo_indptr: *mut i32,
-    kv_indptr: *mut i32,
-    kv_indices: *mut i32,
-    kv_last_page_len: *mut i32,
-    output: *mut f32,
-    total_num_rows: i32,
-    batch_size: i32,
-    num_qo_heads: i32,
-    num_kv_heads: i32,
-    page_size: i32,
-    head_dim: i32,
-    stream: *mut c_void,
-) -> i32;
-
-// ── Embedded CUDA sources ──
-
-const WRAPPER_CU: &str = include_str!("wrapper.cu");
-const WRAPPER_H: &str = include_str!("wrapper.h");
-
-// ── Loaded library handle ──
-
-pub struct FlashInferLib {
-    // Keep the handle alive so the dlopen'd .so remains mapped.
-    _lib: libloading::Library,
-    pub plan: PlanFn,
-    pub run: RunFn,
-    pub extract_slot_indices: ExtractFn,
-    pub derive_indptr_from_mask: DeriveIndptrFn,
-    pub transpose_output: TransposeOutputFn,
-    pub prefill_plan: PrefillPlanFn,
-    pub prefill_run: PrefillRunFn,
-}
-
-// SAFETY: The library handle and function pointers are valid for the lifetime
-// of the process. All functions are called with proper CUDA stream serialization.
-unsafe impl Send for FlashInferLib {}
-unsafe impl Sync for FlashInferLib {}
-
-static FLASHINFER_LIB: OnceLock<FlashInferLib> = OnceLock::new();
-
-/// Ensure the FlashInfer library is compiled and loaded for the given HEAD_DIM.
-/// Returns a reference to the loaded library. Thread-safe via OnceLock.
-pub fn ensure_compiled(head_dim: usize) -> &'static FlashInferLib {
-    FLASHINFER_LIB.get_or_init(|| {
-        assert!(
-            matches!(head_dim, 64 | 128 | 256),
-            "FlashInfer: unsupported HEAD_DIM={} (must be 64, 128, or 256 for f32)",
-            head_dim
-        );
-        let so_path = compile_or_cache(head_dim);
-        unsafe {
-            FlashInferLib::load(&so_path)
-                .unwrap_or_else(|e| panic!("Failed to load FlashInfer library: {e}"))
-        }
-    })
-}
-
-impl FlashInferLib {
-    /// Load a compiled FlashInfer .so and resolve function pointers.
-    ///
-    /// # Safety
-    /// The .so must be a valid FlashInfer wrapper compiled from wrapper.cu.
-    unsafe fn load(path: &Path) -> Result<Self, libloading::Error> {
-        let lib = unsafe { libloading::Library::new(path)? };
-        let plan: PlanFn = unsafe { *lib.get::<PlanFn>(b"flashinfer_batch_decode_plan\0")? };
-        let run: RunFn = unsafe { *lib.get::<RunFn>(b"flashinfer_batch_decode_run\0")? };
-        let extract_slot_indices: ExtractFn =
-            unsafe { *lib.get::<ExtractFn>(b"flashinfer_extract_slot_indices\0")? };
-        let derive_indptr_from_mask: DeriveIndptrFn =
-            unsafe { *lib.get::<DeriveIndptrFn>(b"flashinfer_derive_indptr_from_mask\0")? };
-        let transpose_output: TransposeOutputFn =
-            unsafe { *lib.get::<TransposeOutputFn>(b"flashinfer_transpose_output\0")? };
-        let prefill_plan: PrefillPlanFn =
-            unsafe { *lib.get::<PrefillPlanFn>(b"flashinfer_batch_prefill_plan\0")? };
-        let prefill_run: PrefillRunFn =
-            unsafe { *lib.get::<PrefillRunFn>(b"flashinfer_batch_prefill_run\0")? };
-        Ok(Self {
-            _lib: lib,
-            plan,
-            run,
-            extract_slot_indices,
-            derive_indptr_from_mask,
-            transpose_output,
-            prefill_plan,
-            prefill_run,
-        })
-    }
-}
-
-/// Compile wrapper.cu for the given HEAD_DIM, or return cached .so path.
-fn compile_or_cache(head_dim: usize) -> PathBuf {
-    let cache_dir = cache_directory();
-    std::fs::create_dir_all(&cache_dir).expect("Failed to create FlashInfer cache directory");
-
-    // Extract bundled wrapper sources to the cache so nvcc can compile them.
-    let (wrapper_cu_path, wrapper_h_dir) = extract_wrapper_sources(&cache_dir);
-
-    let arch = detect_cuda_arch();
-    // Bake a hash of the embedded wrapper into the .so name so old caches are
-    // discarded automatically when wrapper.cu or wrapper.h change.
-    let wrapper_hash = wrapper_source_hash();
-    let so_name = format!(
-        "libflashinfer_hd{}_{}_w{:016x}.so",
-        head_dim, arch, wrapper_hash
-    );
-    let so_path = cache_dir.join(&so_name);
-
-    if so_path.exists() {
-        eprintln!(
-            "FlashInfer: using cached library for HEAD_DIM={} ({})",
-            head_dim,
-            so_path.display()
-        );
-        return so_path;
-    }
-
-    let Some((flashinfer_include, cutlass_include)) = locate_flashinfer_includes() else {
-        panic!(
-            "FlashInfer: could not locate header tree. Set LUMINAL_FLASHINFER_DIR to the \
-             FlashInfer source root (the directory containing `include/` and \
-             `3rdparty/cutlass/include/`)."
-        );
-    };
-
-    eprintln!(
-        "FlashInfer: JIT compiling for HEAD_DIM={}, arch={} ...",
-        head_dim, arch
-    );
-    let start = std::time::Instant::now();
-
-    let output = Command::new("nvcc")
-        .args([
-            "-shared",
-            "-o",
-            so_path.to_str().unwrap(),
-            &format!("-DLUMINAL_HEAD_DIM={}", head_dim),
-            wrapper_cu_path.to_str().unwrap(),
-            "-I",
-            flashinfer_include.to_str().unwrap(),
-            "-I",
-            cutlass_include.to_str().unwrap(),
-            "-I",
-            wrapper_h_dir.to_str().unwrap(),
-            "-std=c++17",
-            &format!("-arch={}", arch),
-            "-O3",
-            "--expt-relaxed-constexpr",
-            "-w",
-            "-rdc=true",
-            "--compiler-options",
-            "-fPIC",
-        ])
-        .output()
-        .expect("Failed to run nvcc. Is the CUDA toolkit installed?");
-
-    if !output.status.success() {
-        let stderr = String::from_utf8_lossy(&output.stderr);
-        let stdout = String::from_utf8_lossy(&output.stdout);
-        let _ = std::fs::remove_file(&so_path);
-        panic!(
-            "FlashInfer JIT compilation failed (HEAD_DIM={}, arch={}):\nstdout: {}\nstderr: {}",
-            head_dim, arch, stdout, stderr
-        );
-    }
-
-    let elapsed = start.elapsed();
-    eprintln!(
-        "FlashInfer: compiled in {:.1}s → {}",
-        elapsed.as_secs_f64(),
-        so_path.display()
-    );
-
-    so_path
-}
-
-/// Returns ~/.cache/luminal/flashinfer/
-fn cache_directory() -> PathBuf {
-    let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string());
-    PathBuf::from(home)
-        .join(".cache")
-        .join("luminal")
-        .join("flashinfer")
-}
-
-/// Drop the embedded wrapper.cu/wrapper.h into the cache dir so nvcc has files
-/// on disk to compile. Returns (wrapper.cu path, directory containing wrapper.h).
-fn extract_wrapper_sources(cache_dir: &Path) -> (PathBuf, PathBuf) {
-    let cu = cache_dir.join("wrapper.cu");
-    let h = cache_dir.join("wrapper.h");
-    write_if_changed(&cu, WRAPPER_CU.as_bytes());
-    write_if_changed(&h, WRAPPER_H.as_bytes());
-    (cu, cache_dir.to_path_buf())
-}
-
-fn write_if_changed(path: &Path, contents: &[u8]) {
-    if let Ok(existing) = std::fs::read(path)
-        && existing == contents
-    {
-        return;
-    }
-    std::fs::write(path, contents).unwrap_or_else(|e| {
-        panic!(
-            "FlashInfer: failed to write wrapper source to {}: {e}",
-            path.display()
-        )
-    });
-}
-
-fn wrapper_source_hash() -> u64 {
-    let mut hasher = std::collections::hash_map::DefaultHasher::new();
-    WRAPPER_CU.hash(&mut hasher);
-    WRAPPER_H.hash(&mut hasher);
-    hasher.finish()
-}
-
-// ── Pinned FlashInfer source ──
-//
-// Bumping this constant invalidates the cached source tree AND the cached .so
-// (the .so cache key incorporates the wrapper hash, which is rebuilt against
-// these headers, so different headers compile to a different .so file even at
-// the same head_dim). If you change `FLASHINFER_GIT_REV`, also re-check
-// `wrapper.cu` against the new FlashInfer API.
-
-const FLASHINFER_GIT_URL: &str = "https://github.com/flashinfer-ai/flashinfer.git";
-const CUTLASS_GIT_URL: &str = "https://github.com/NVIDIA/cutlass.git";
-const FLASHINFER_GIT_REV: &str = "f1e6fdcb8f65104047697f022b5d055ef022d763";
-const CUTLASS_GIT_REV: &str = "f3fde58372d33e9a5650ba7b80fc48b3b49d40c8";
-
-fn locate_flashinfer_includes() -> Option<(PathBuf, PathBuf)> {
-    if let Ok(path) = std::env::var("LUMINAL_FLASHINFER_DIR")
-        && !path.is_empty()
-    {
-        let root = PathBuf::from(path);
-        let inc = root.join("include");
-        let cutlass = root.join("3rdparty/cutlass/include");
-        if inc.exists() && cutlass.exists() {
-            return Some((inc, cutlass));
-        }
-        eprintln!(
-            "FlashInfer: LUMINAL_FLASHINFER_DIR={} did not contain include/ and \
-             3rdparty/cutlass/include/ — falling back to default locations",
-            root.display()
-        );
-    }
-
-    let home = std::env::var("HOME").unwrap_or_default();
-    let candidates = [
-        PathBuf::from(&home).join("luminal_cuda/crates/luminal_cuda/flashinfer"),
-        PathBuf::from(&home).join("luminal_cuda/flashinfer"),
-        PathBuf::from("/opt/luminal_cuda/crates/luminal_cuda/flashinfer"),
-    ];
-    for root in candidates {
-        let inc = root.join("include");
-        let cutlass = root.join("3rdparty/cutlass/include");
-        if inc.exists() && cutlass.exists() {
-            return Some((inc, cutlass));
-        }
-    }
-
-    // Last resort: fetch the pinned commit into the cache directory.
-    fetch_flashinfer_source().ok().map(|root| {
-        let inc = root.join("include");
-        let cutlass = root.join("3rdparty/cutlass/include");
-        (inc, cutlass)
-    })
-}
-
-/// Clone FlashInfer at `FLASHINFER_GIT_REV` + CUTLASS at `CUTLASS_GIT_REV`
-/// into `~/.cache/luminal/flashinfer-src/<short_rev>/` if absent, then return
-/// the FlashInfer root directory. ~50 MB one-time download; subsequent calls
-/// short-circuit on the directory check.
-fn fetch_flashinfer_source() -> Result<PathBuf, String> {
-    let short = &FLASHINFER_GIT_REV[..12];
-    let cache_root = cache_directory().join("flashinfer-src").join(short);
-    let inc = cache_root.join("include");
-    let cutlass_inc = cache_root.join("3rdparty/cutlass/include");
-
-    if inc.exists() && cutlass_inc.exists() {
-        return Ok(cache_root);
-    }
-
-    let parent = cache_root.parent().unwrap();
-    std::fs::create_dir_all(parent)
-        .map_err(|e| format!("failed to create {}: {e}", parent.display()))?;
-
-    // Clone into a staging dir, then atomic rename. Protects against multiple
-    // processes racing to fetch the same source.
-    let staging = parent.join(format!(".staging-{}-{}", short, std::process::id()));
-    let _ = std::fs::remove_dir_all(&staging);
-
-    eprintln!(
-        "FlashInfer: cloning {FLASHINFER_GIT_URL} @ {short} into {} (one-time fetch, ~50 MB) …",
-        cache_root.display()
-    );
-
-    run_git(&[
-        "clone",
-        "--filter=blob:none",
-        "--no-checkout",
-        FLASHINFER_GIT_URL,
-        staging.to_str().unwrap(),
-    ])?;
-    run_git_in(&staging, &["checkout", FLASHINFER_GIT_REV])?;
-
-    // Init only the CUTLASS submodule (skip spdlog — we don't need it for kernels).
-    let cutlass_path = staging.join("3rdparty/cutlass");
-    let _ = std::fs::remove_dir_all(&cutlass_path);
-    run_git(&[
-        "clone",
-        "--filter=blob:none",
-        "--no-checkout",
-        CUTLASS_GIT_URL,
-        cutlass_path.to_str().unwrap(),
-    ])?;
-    run_git_in(&cutlass_path, &["checkout", CUTLASS_GIT_REV])?;
-
-    if !staging.join("include").exists() {
-        return Err(format!(
-            "FlashInfer clone succeeded but include/ missing at {}",
-            staging.display()
-        ));
-    }
-    if !staging.join("3rdparty/cutlass/include").exists() {
-        return Err(format!(
-            "CUTLASS clone succeeded but include/ missing at {}",
-            staging.join("3rdparty/cutlass").display()
-        ));
-    }
-
-    // Atomic-ish rename. If another process beat us to it, just keep theirs.
-    match std::fs::rename(&staging, &cache_root) {
-        Ok(()) => {}
-        Err(_) if cache_root.exists() => {
-            let _ = std::fs::remove_dir_all(&staging);
-        }
-        Err(e) => return Err(format!("rename to {} failed: {e}", cache_root.display())),
-    }
-
-    Ok(cache_root)
-}
-
-fn run_git(args: &[&str]) -> Result<(), String> {
-    let out = Command::new("git")
-        .args(args)
-        .output()
-        .map_err(|e| format!("failed to spawn `git`: {e}. Is git installed?"))?;
-    if !out.status.success() {
-        return Err(format!(
-            "`git {}` failed: {}",
-            args.join(" "),
-            String::from_utf8_lossy(&out.stderr)
-        ));
-    }
-    Ok(())
-}
-
-fn run_git_in(cwd: &Path, args: &[&str]) -> Result<(), String> {
-    let out = Command::new("git")
-        .args(args)
-        .current_dir(cwd)
-        .output()
-        .map_err(|e| format!("failed to spawn `git`: {e}"))?;
-    if !out.status.success() {
-        return Err(format!(
-            "`git {}` in {} failed: {}",
-            args.join(" "),
-            cwd.display(),
-            String::from_utf8_lossy(&out.stderr)
-        ));
-    }
-    Ok(())
-}
-
-/// Detect CUDA arch via env override → nvidia-smi → default sm_80.
-fn detect_cuda_arch() -> String {
-    if let Ok(arch) = std::env::var("FLASHINFER_CUDA_ARCH") {
-        return arch;
-    }
-
-    if let Ok(output) = Command::new("nvidia-smi")
-        .args(["--query-gpu=compute_cap", "--format=csv,noheader"])
-        .output()
-        && output.status.success()
-    {
-        let cap = String::from_utf8_lossy(&output.stdout);
-        let cap = cap.trim().lines().next().unwrap_or("8.0");
-        let sm = cap.replace('.', "");
-        if !sm.is_empty() {
-            return format!("sm_{}", sm);
-        }
-    }
-
-    "sm_80".to_string()
-}
--- a/crates/luminal_cuda_lite/src/host/flashinfer/mod.rs
+++ b/crates/luminal_cuda_lite/src/host/flashinfer/mod.rs
@@ -1,424 +0,0 @@
-pub mod find_indptrs;
-pub mod jit;
-
-use std::sync::{Arc, Mutex, OnceLock};
-
-use luminal::{
-    egglog_utils::{
-        api::{Rule, SortDef, sort},
-        base::{EXPRESSION, OP_KIND},
-        extract_expr,
-    },
-    op::{EgglogOp, LLIROp},
-    prelude::{
-        tracing::{Level, span},
-        *,
-    },
-};
-
-use crate::{
-    cudarc::driver::{CudaSlice, CudaStream, DevicePtr, result},
-    host::{DeviceBuffer, HostOp},
-};
-
-/// FlashInfer attention op (batch decode, fp32).
-///
-/// Replaces the full paged-GQA attention pattern (gather → broadcast → Q*K^T →
-/// scale → mask → softmax → *V) with a single FlashInfer fused kernel.
-///
-/// Graph inputs (7): Q, K_pool, V_pool, flat_gather_idx, mask, qo_indptr, kv_indptr.
-/// The egglog rule captures the first 5; `extract()` appends qo/kv indptrs after
-/// walking the e-graph from the mask. `batch_size` is derived at runtime from the
-/// indptr length (= num_sequences + 1).
-#[derive(Debug)]
-pub struct FlashInferAttention {
-    pub num_qo_heads: usize,
-    pub num_kv_heads: usize,
-    pub head_dim: usize,
-    pub page_size: usize,
-    pub batch_dim: Expression,
-
-    pub plan_info: Mutex<Vec<i64>>,
-}
-
-// SAFETY: PAGE_LOCKED_WORKSPACE holds a raw pointer to page-locked CUDA memory
-// allocated once and serialized via the CUDA stream that owns it.
-unsafe impl Send for FlashInferAttention {}
-unsafe impl Sync for FlashInferAttention {}
-
-const FLOAT_WORKSPACE_SIZE: usize = 128 * 1024 * 1024; // 128 MiB
-const INT_WORKSPACE_SIZE: usize = 8 * 1024 * 1024; // 8 MiB
-
-static PAGE_LOCKED_WORKSPACE: OnceLock<PageLockedPtr> = OnceLock::new();
-
-struct PageLockedPtr(*mut u8);
-
-// SAFETY: The pointer is page-locked CUDA memory allocated once via
-// posix_memalign + cudaHostRegister and only mutated during OnceLock
-// initialization.
-unsafe impl Send for PageLockedPtr {}
-unsafe impl Sync for PageLockedPtr {}
-
-impl std::fmt::Debug for PageLockedPtr {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "PageLockedPtr({:p})", self.0)
-    }
-}
-
-impl Default for FlashInferAttention {
-    fn default() -> Self {
-        Self {
-            num_qo_heads: 0,
-            num_kv_heads: 0,
-            head_dim: 0,
-            page_size: 0,
-            batch_dim: Expression::default(),
-            plan_info: Mutex::new(Vec::new()),
-        }
-    }
-}
-
-impl EgglogOp for FlashInferAttention {
-    fn sort(&self) -> SortDef {
-        sort(
-            OP_KIND,
-            "FlashInferAttention",
-            &[
-                ("num_qo_heads", EXPRESSION),
-                ("num_kv_heads", EXPRESSION),
-                ("head_dim", EXPRESSION),
-                ("page_size", EXPRESSION),
-                ("batch_dim", EXPRESSION),
-            ],
-        )
-    }
-
-    fn n_inputs(&self) -> usize {
-        // Q, K_pool, V_pool, flat_gather_idx, mask (egglog IList).
-        // extract() appends qo_indptr + kv_indptr → 7 actual inputs at runtime.
-        5
-    }
-
-    fn rewrites(&self) -> Vec<Rule> {
-        vec![Rule::raw(include_str!["flashinfer_attention.egg"])]
-    }
-
-    fn extract<'a>(
-        &'a self,
-        egraph: &'a luminal::egglog_utils::SerializedEGraph,
-        kind_children: &[&'a ENodeId],
-        input_enodes: Vec<&'a ENodeId>,
-        _list_cache: &mut FxHashMap<&'a ENodeId, Vec<Expression>>,
-        expr_cache: &mut FxHashMap<&'a ENodeId, Expression>,
-    ) -> (LLIROp, Vec<&'a ENodeId>) {
-        let num_qo_heads = extract_expr(egraph, kind_children[0], expr_cache)
-            .unwrap()
-            .exec(&FxHashMap::default())
-            .unwrap();
-        let num_kv_heads = extract_expr(egraph, kind_children[1], expr_cache)
-            .unwrap()
-            .exec(&FxHashMap::default())
-            .unwrap();
-        let head_dim = extract_expr(egraph, kind_children[2], expr_cache)
-            .unwrap()
-            .exec(&FxHashMap::default())
-            .unwrap();
-        let page_size = extract_expr(egraph, kind_children[3], expr_cache)
-            .unwrap()
-            .exec(&FxHashMap::default())
-            .unwrap();
-        let batch_dim = extract_expr(egraph, kind_children[4], expr_cache).unwrap();
-
-        let extracted = Self {
-            num_qo_heads,
-            num_kv_heads,
-            head_dim,
-            page_size,
-            batch_dim,
-            plan_info: Mutex::new(Vec::new()),
-        };
-
-        // Trigger JIT compilation (or .so cache hit) at extract time, not at
-        // first execute. Pays the ~30s cold-cache nvcc cost during compile
-        // rather than during the GA profiling loop, where it would dominate
-        // the candidate's measured runtime and make the GA reject FlashInfer.
-        let _ = jit::ensure_compiled(head_dim);
-
-        // Walk the mask e-graph chain to recover qo_indptr / kv_indptr Input nodes.
-        // input_enodes: [Q, K_cache, V_cache, gather_idx, mask]
-        let mask_node = input_enodes[4];
-        let indptrs = find_indptrs::find_indptr_inputs(egraph, mask_node);
-
-        // Build final inputs: [Q, K_cache, V_cache, gather_idx, mask, qo_indptr, kv_indptr]
-        let mut final_inputs = input_enodes;
-        final_inputs.push(indptrs.qo_indptr);
-        final_inputs.push(indptrs.kv_indptr);
-
-        let op = LLIROp::new::<dyn HostOp>(Box::new(extracted) as Box<dyn HostOp>);
-        (op, final_inputs)
-    }
-
-    fn cleanup(&self) -> bool {
-        false
-    }
-}
-
-impl HostOp for FlashInferAttention {
-    fn execute(
-        &self,
-        stream: &Arc<CudaStream>,
-        self_node: NodeIndex,
-        inputs: &[NodeIndex],
-        buffers: &FxHashMap<NodeIndex, DeviceBuffer>,
-        dyn_map: &FxHashMap<char, usize>,
-    ) -> anyhow::Result<()> {
-        let lib = jit::ensure_compiled(self.head_dim);
-
-        let total_q_tokens = self
-            .batch_dim
-            .exec(dyn_map)
-            .ok_or_else(|| anyhow::anyhow!("FlashInferAttention batch_dim is unresolved"))?;
-        let c = *dyn_map
-            .get(&'c')
-            .ok_or_else(|| anyhow::anyhow!("FlashInferAttention requires dynamic dim 'c'"))?;
-        let r = *dyn_map
-            .get(&'r')
-            .ok_or_else(|| anyhow::anyhow!("FlashInferAttention requires dynamic dim 'r'"))?;
-
-        if inputs.len() < 7 {
-            anyhow::bail!(
-                "FlashInferAttention expects 7 inputs (Q, K, V, flat_idx, mask, qo_indptr, kv_indptr), got {}",
-                inputs.len()
-            );
-        }
-
-        let get_buf = |name: &str, node: NodeIndex| -> anyhow::Result<DeviceBuffer> {
-            buffers.get(&node).copied().ok_or_else(|| {
-                anyhow::anyhow!("FlashInferAttention missing {name} buffer for {node:?}")
-            })
-        };
-
-        let q_buf = get_buf("Q", inputs[0])?;
-        let k_buf = get_buf("K_cache", inputs[1])?;
-        let v_buf = get_buf("V_cache", inputs[2])?;
-        let flat_idx_buf = get_buf("flat_gather_idx", inputs[3])?;
-        // inputs[4] = mask (unused by FlashInfer — indptrs replace it)
-        let kv_indptr_buf = get_buf("kv_indptr", inputs[6])?;
-        let out_buf = get_buf("output", self_node)?;
-
-        // Derive batch_size (num sequences) from r = indptr length.
-        let batch_size = r.saturating_sub(1);
-
-        let _span = span!(
-            Level::TRACE,
-            "FlashInferAttention",
-            total_q_tokens,
-            batch_size,
-            self.num_qo_heads,
-            self.num_kv_heads,
-            self.head_dim,
-        )
-        .entered();
-
-        let kv_dim = self.num_kv_heads * self.head_dim;
-        let cu_stream = stream.cu_stream() as *mut std::ffi::c_void;
-
-        // Extract slot indices (one per context page) from the flat gather index.
-        let indices_buf = unsafe { stream.alloc::<u8>(c.max(1) * std::mem::size_of::<i32>())? };
-        let (indices_ptr, _idx_guard) = indices_buf.device_ptr(stream);
-
-        if c > 0 {
-            unsafe {
-                (lib.extract_slot_indices)(
-                    flat_idx_buf.ptr() as *const i32,
-                    indices_ptr as *mut i32,
-                    c as i32,
-                    kv_dim as i32,
-                    cu_stream,
-                );
-            }
-        }
-
-        // Read kv_indptr to host for the plan phase.
-        let kv_indptr_bytes = r * 4;
-        let mut kv_indptr_host_bytes = vec![0u8; kv_indptr_bytes];
-        unsafe {
-            result::memcpy_dtoh_async(
-                &mut kv_indptr_host_bytes,
-                kv_indptr_buf.ptr(),
-                stream.cu_stream(),
-            )?;
-        }
-        stream.synchronize()?;
-        let kv_indptr_host: Vec<i32> = unsafe {
-            let mut v = std::mem::ManuallyDrop::new(kv_indptr_host_bytes);
-            Vec::from_raw_parts(v.as_mut_ptr() as *mut i32, r, r)
-        };
-
-        // kv_last_page_len = [1; batch_size] when page_size=1.
-        let last_page_host: Vec<i32> = vec![1; batch_size];
-        let last_page_dev: CudaSlice<u8> = if batch_size > 0 {
-            stream.clone_htod(unsafe {
-                std::slice::from_raw_parts(
-                    last_page_host.as_ptr() as *const u8,
-                    last_page_host.len() * std::mem::size_of::<i32>(),
-                )
-            })?
-        } else {
-            unsafe { stream.alloc::<u8>(1)? }
-        };
-        let (last_page_ptr, _lp_guard) = last_page_dev.device_ptr(stream);
-
-        // Global shared workspaces (allocated once across all op instances to
-        // amortize the ~4ms first-allocation cost during GA profiling).
-        static FLOAT_WORKSPACE: OnceLock<CudaSlice<u8>> = OnceLock::new();
-        static INT_WORKSPACE: OnceLock<CudaSlice<u8>> = OnceLock::new();
-        let float_ws = FLOAT_WORKSPACE
-            .get_or_init(|| unsafe { stream.alloc::<u8>(FLOAT_WORKSPACE_SIZE).unwrap() });
-        let int_ws = INT_WORKSPACE
-            .get_or_init(|| unsafe { stream.alloc::<u8>(INT_WORKSPACE_SIZE).unwrap() });
-        let page_locked_ws = PAGE_LOCKED_WORKSPACE.get_or_init(|| unsafe {
-            let mut ptr: *mut std::ffi::c_void = std::ptr::null_mut();
-            let status = libc::posix_memalign(&mut ptr, 4096, INT_WORKSPACE_SIZE);
-            assert_eq!(status, 0, "Failed to allocate page-locked workspace");
-            let cuda_status = cuda_pin_memory(ptr, INT_WORKSPACE_SIZE);
-            assert_eq!(cuda_status, 0, "Failed to pin memory");
-            PageLockedPtr(ptr as *mut u8)
-        });
-
-        let (float_ws_ptr, _fws_guard) = float_ws.device_ptr(stream);
-        let (int_ws_ptr, _iws_guard) = int_ws.device_ptr(stream);
-
-        // FlashInfer decode writes (total_q_tokens, heads, dim);
-        // luminal expects (heads, total_q_tokens, dim) — transpose at the end.
-        let output_elems = total_q_tokens * self.num_qo_heads * self.head_dim;
-        let temp_out_buf =
-            unsafe { stream.alloc::<u8>(output_elems * std::mem::size_of::<f32>())? };
-        let (temp_out_ptr, _tmp_guard) = temp_out_buf.device_ptr(stream);
-
-        // PrefillPlanInfo has 15 entries, DecodePlanInfo fewer — 16 is enough.
-        let mut plan_info_buf = [0i64; 16];
-        let mut plan_info_len: i32 = 0;
-
-        // ── BatchDecode path ──
-        // Prefill kernels require fp16/bf16 tensor-core MMA; the C API returns -1
-        // when called from the fp32 pipeline. We only use decode here.
-        let plan_ret = unsafe {
-            (lib.plan)(
-                float_ws_ptr as *mut std::ffi::c_void,
-                FLOAT_WORKSPACE_SIZE,
-                int_ws_ptr as *mut std::ffi::c_void,
-                INT_WORKSPACE_SIZE,
-                page_locked_ws.0 as *mut std::ffi::c_void,
-                kv_indptr_host.as_ptr() as *mut i32,
-                batch_size as i32,
-                self.num_qo_heads as i32,
-                self.num_kv_heads as i32,
-                self.page_size as i32,
-                self.head_dim as i32,
-                cu_stream,
-                plan_info_buf.as_mut_ptr(),
-                &mut plan_info_len,
-            )
-        };
-        if plan_ret != 0 {
-            return Err(anyhow::anyhow!(
-                "FlashInfer decode plan failed with error code {plan_ret}"
-            ));
-        }
-
-        let mut plan_info = self.plan_info.lock().unwrap();
-        plan_info.clear();
-        plan_info.extend_from_slice(&plan_info_buf[..plan_info_len as usize]);
-
-        let run_ret = unsafe {
-            (lib.run)(
-                float_ws_ptr as *mut std::ffi::c_void,
-                FLOAT_WORKSPACE_SIZE,
-                int_ws_ptr as *mut std::ffi::c_void,
-                plan_info.as_mut_ptr(),
-                plan_info.len() as i32,
-                q_buf.ptr() as *mut f32,
-                k_buf.ptr() as *mut f32,
-                v_buf.ptr() as *mut f32,
-                kv_indptr_buf.ptr() as *mut i32,
-                indices_ptr as *mut i32,
-                last_page_ptr as *mut i32,
-                temp_out_ptr as *mut f32,
-                batch_size as i32,
-                self.num_qo_heads as i32,
-                self.num_kv_heads as i32,
-                self.page_size as i32,
-                self.head_dim as i32,
-                cu_stream,
-            )
-        };
-        drop(plan_info);
-
-        if run_ret != 0 {
-            return Err(anyhow::anyhow!(
-                "FlashInfer decode run failed with error code {run_ret}"
-            ));
-        }
-
-        // Transpose (total_q_tokens, heads, dim) → (heads, total_q_tokens, dim)
-        unsafe {
-            (lib.transpose_output)(
-                temp_out_ptr as *const f32,
-                out_buf.ptr() as *mut f32,
-                total_q_tokens as i32,
-                self.num_qo_heads as i32,
-                self.head_dim as i32,
-                cu_stream,
-            );
-        }
-
-        Ok(())
-    }
-
-    fn output_size(&self) -> Expression {
-        self.batch_dim * self.num_qo_heads * self.head_dim
-    }
-
-    fn output_bytes(&self) -> Expression {
-        self.output_size() * 4
-    }
-
-    fn stats_name(&self) -> Option<&'static str> {
-        Some("FlashInferAttention")
-    }
-}
-
-/// Pin host memory for CUDA async memcpy.
-///
-/// `cudaHostRegister` lives in libcudart, which cudarc doesn't link to our
-/// binary. Resolve it via `dlopen`/`dlsym` so we don't need a build script or
-/// a `#[link]` directive — keeping the crate buildable without any nvcc-side
-/// dependencies.
-unsafe fn cuda_pin_memory(ptr: *mut std::ffi::c_void, size: usize) -> i32 {
-    type HostRegisterFn = unsafe extern "C" fn(*mut std::ffi::c_void, usize, u32) -> i32;
-    static FN: OnceLock<usize> = OnceLock::new();
-
-    let raw = *FN.get_or_init(|| unsafe {
-        let lib = [
-            "libcudart.so",
-            "libcudart.so.13",
-            "libcudart.so.12",
-            "libcudart.so.11",
-        ]
-        .iter()
-        .find_map(|n| libloading::Library::new(*n).ok())
-        .expect("FlashInfer: could not dlopen libcudart for cudaHostRegister");
-        let sym: libloading::Symbol<HostRegisterFn> = lib
-            .get(b"cudaHostRegister\0")
-            .expect("FlashInfer: libcudart missing cudaHostRegister symbol");
-        let ptr = *sym as *const () as usize;
-        // Keep libcudart resident for the process lifetime so the function
-        // pointer remains valid.
-        std::mem::forget(lib);
-        ptr
-    });
-    let f: HostRegisterFn = unsafe { std::mem::transmute(raw) };
-    // cudaHostRegisterDefault = 0
-    unsafe { f(ptr, size, 0) }
-}
--- a/crates/luminal_cuda_lite/src/host/flashinfer/wrapper.cu
+++ b/crates/luminal_cuda_lite/src/host/flashinfer/wrapper.cu
@@ -1,357 +0,0 @@
-// FlashInfer batch decode + prefill wrapper for luminal_cuda.
-// JIT-compiled at runtime with -DLUMINAL_HEAD_DIM=N.
-//
-// Decode: instantiated for f32 (scalar vectorized dot products, no tensor cores).
-// Prefill: instantiated for f16 (requires tensor core MMA + ldmatrix).
-//   The C API accepts fp32 buffers; cast kernels convert fp32↔fp16 at the boundary.
-//
-// NHD layout. GQA group_size and page_size are runtime parameters.
-
-#ifndef LUMINAL_HEAD_DIM
-#error "LUMINAL_HEAD_DIM must be defined (e.g. -DLUMINAL_HEAD_DIM=128)"
-#endif
-
-// Include utils.cuh first to get the original DISPATCH_HEAD_DIM, then override it
-// to only instantiate our specific HEAD_DIM. This avoids a compile error in
-// cascade.cuh where HEAD_DIM=512 + f32 triggers vec_size=16, vec_bits=512
-// which exceeds cp_async's 256-bit limit.
-#include <flashinfer/utils.cuh>
-#undef DISPATCH_HEAD_DIM
-#define DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, ...)  \
-  {                                                  \
-    constexpr size_t HEAD_DIM = LUMINAL_HEAD_DIM;    \
-    __VA_ARGS__                                      \
-  }
-
-#include <flashinfer/attention/scheduler.cuh>
-#include <flashinfer/attention/decode.cuh>
-#include <flashinfer/attention/default_decode_params.cuh>
-#include <flashinfer/attention/prefill.cuh>
-#include <flashinfer/attention/default_prefill_params.cuh>
-#include <flashinfer/attention/mask.cuh>
-#include <flashinfer/attention/variants.cuh>
-#include <flashinfer/page.cuh>
-#include <flashinfer/pos_enc.cuh>
-
-#include "wrapper.h"
-
-#include <cstring>
-#include <vector>
-#include <cuda_fp16.h>
-
-using namespace flashinfer;
-
-// ── Decode types (f32) ──
-using DTypeQ = float;
-using DTypeKV = float;
-using DTypeO = float;
-using IdType = int32_t;
-
-// ── Prefill types (f16 compute, fp32 external interface) ──
-using PrefillDTypeQ = half;
-using PrefillDTypeKV = half;
-using PrefillDTypeO = half;
-
-constexpr uint32_t HEAD_DIM = LUMINAL_HEAD_DIM;
-constexpr PosEncodingMode POS_ENCODING_MODE = PosEncodingMode::kNone;
-
-// Attention variants
-using Variant = DefaultAttention</*use_custom_mask=*/false,
-                                  /*use_sliding_window=*/false,
-                                  /*use_logits_soft_cap=*/false,
-                                  /*use_alibi=*/false>;
-
-using CausalVariant = DefaultAttention</*use_custom_mask=*/false,
-                                        /*use_sliding_window=*/false,
-                                        /*use_logits_soft_cap=*/false,
-                                        /*use_alibi=*/false>;
-
-// Decode params (f32)
-using DecodeParams = BatchDecodeParams<DTypeQ, DTypeKV, DTypeO, IdType>;
-
-// Prefill params (f16)
-using PrefillParams = BatchPrefillPagedParams<PrefillDTypeQ, PrefillDTypeKV, PrefillDTypeO, IdType>;
-
-// Forward declarations
-namespace flashinfer {
-template <uint32_t HEAD_DIM, PosEncodingMode POS_ENCODING_MODE, typename AttentionVariant,
-          typename Params>
-cudaError_t BatchDecodeWithPagedKVCacheDispatched(Params params, typename Params::DTypeO* tmp_v,
-                                                   float* tmp_s, bool enable_pdl,
-                                                   cudaStream_t stream);
-
-template <uint32_t CTA_TILE_Q, uint32_t HEAD_DIM_QK, uint32_t HEAD_DIM_VO,
-          PosEncodingMode POS_ENCODING_MODE, bool USE_FP16_QK_REDUCTION,
-          MaskMode MASK_MODE, typename AttentionVariant, typename Params>
-cudaError_t BatchPrefillWithPagedKVCacheDispatched(Params params, typename Params::DTypeO* tmp_v,
-                                                    float* tmp_s, bool enable_pdl,
-                                                    cudaStream_t stream);
-}
-
-// Explicit instantiation: decode kernel (f32)
-template cudaError_t flashinfer::BatchDecodeWithPagedKVCacheDispatched<
-    HEAD_DIM, POS_ENCODING_MODE, Variant, DecodeParams>(
-    DecodeParams params, DTypeO* tmp_v, float* tmp_s, bool enable_pdl, cudaStream_t stream);
-
-// Explicit instantiation: prefill kernels (f16, causal mask, CTA_TILE_Q=16/64/128)
-template cudaError_t flashinfer::BatchPrefillWithPagedKVCacheDispatched<
-    16, HEAD_DIM, HEAD_DIM, POS_ENCODING_MODE, false, MaskMode::kCausal, CausalVariant, PrefillParams>(
-    PrefillParams params, PrefillDTypeO* tmp_v, float* tmp_s, bool enable_pdl, cudaStream_t stream);
-
-template cudaError_t flashinfer::BatchPrefillWithPagedKVCacheDispatched<
-    64, HEAD_DIM, HEAD_DIM, POS_ENCODING_MODE, false, MaskMode::kCausal, CausalVariant, PrefillParams>(
-    PrefillParams params, PrefillDTypeO* tmp_v, float* tmp_s, bool enable_pdl, cudaStream_t stream);
-
-template cudaError_t flashinfer::BatchPrefillWithPagedKVCacheDispatched<
-    128, HEAD_DIM, HEAD_DIM, POS_ENCODING_MODE, false, MaskMode::kCausal, CausalVariant, PrefillParams>(
-    PrefillParams params, PrefillDTypeO* tmp_v, float* tmp_s, bool enable_pdl, cudaStream_t stream);
-
-// ── fp32 ↔ fp16 cast kernels ──
-
-__global__ void cast_f32_to_f16_kernel(const float* src, half* dst, size_t n) {
-    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < n) dst[i] = __float2half(src[i]);
-}
-
-__global__ void cast_f16_to_f32_kernel(const half* src, float* dst, size_t n) {
-    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < n) dst[i] = __half2float(src[i]);
-}
-
-extern "C" {
-
-int flashinfer_batch_decode_plan(
-    void* float_workspace, size_t float_ws_size,
-    void* int_workspace, size_t int_ws_size,
-    void* page_locked_int_workspace,
-    int32_t* indptr_h, int batch_size,
-    int num_qo_heads, int num_kv_heads, int page_size, int head_dim,
-    cudaStream_t stream,
-    int64_t* plan_info_out, int* plan_info_len_out)
-{
-    (void)head_dim; // fixed at compile time
-
-    DecodePlanInfo plan_info;
-    uint32_t group_size = num_qo_heads / num_kv_heads;
-
-    // We need to dispatch on GROUP_SIZE to get the right work estimation function
-    cudaError_t status = cudaSuccess;
-
-    // Use a lambda to dispatch on group size
-    auto do_plan = [&]<uint32_t GROUP_SIZE>() -> cudaError_t {
-        auto work_estimation_func =
-            BatchDecodeWithPagedKVCacheWorkEstimationDispatched<
-                GROUP_SIZE, HEAD_DIM, POS_ENCODING_MODE, Variant, DecodeParams>;
-        return DecodePlan<HEAD_DIM, POS_ENCODING_MODE, Variant, DecodeParams>(
-            float_workspace, float_ws_size,
-            int_workspace, page_locked_int_workspace,
-            int_ws_size, plan_info, indptr_h,
-            (uint32_t)batch_size, (uint32_t)num_qo_heads,
-            (uint32_t)page_size, /*enable_cuda_graph=*/false,
-            stream, work_estimation_func);
-    };
-
-    switch (group_size) {
-        case 1:  status = do_plan.operator()<1>();  break;
-        case 2:  status = do_plan.operator()<2>();  break;
-        case 4:  status = do_plan.operator()<4>();  break;
-        case 8:  status = do_plan.operator()<8>();  break;
-        default: return -1; // unsupported group size
-    }
-
-    if (status != cudaSuccess) return (int)status;
-
-    auto vec = plan_info.ToVector();
-    *plan_info_len_out = (int)vec.size();
-    std::memcpy(plan_info_out, vec.data(), vec.size() * sizeof(int64_t));
-    return 0;
-}
-
-int flashinfer_batch_decode_run(
-    void* float_workspace, size_t float_ws_size,
-    void* int_workspace,
-    int64_t* plan_info_vec, int plan_info_len,
-    float* q,
-    float* k_cache,
-    float* v_cache,
-    int32_t* kv_indptr,
-    int32_t* kv_indices,
-    int32_t* kv_last_page_len,
-    float* output,
-    int batch_size,
-    int num_qo_heads, int num_kv_heads, int page_size, int head_dim,
-    cudaStream_t stream)
-{
-    (void)head_dim; // fixed at compile time
-
-    DecodePlanInfo plan_info;
-    plan_info.FromVector(std::vector<int64_t>(plan_info_vec, plan_info_vec + plan_info_len));
-
-    // Construct paged_kv_t with NHD layout
-    paged_kv_t<DTypeKV, IdType> paged_kv(
-        (uint32_t)num_kv_heads,
-        (uint32_t)page_size,
-        HEAD_DIM,
-        (uint32_t)batch_size,
-        QKVLayout::kNHD,
-        k_cache,
-        v_cache,
-        kv_indices,
-        kv_indptr,
-        kv_last_page_len);
-
-    DecodeParams params;
-    params.q = q;
-    params.q_rope_offset = nullptr;
-    params.paged_kv = paged_kv;
-    params.o = output;
-    params.lse = nullptr;
-    params.maybe_alibi_slopes = nullptr;
-    params.padded_batch_size = plan_info.padded_batch_size;
-    params.num_qo_heads = (uint32_t)num_qo_heads;
-    // Q buffer is (batch, num_qo_heads * head_dim) flat — the graph's split_dims + transpose
-    // are stride tricks, no data movement. So the actual memory layout is (batch, heads, dim).
-    params.q_stride_n = num_qo_heads * HEAD_DIM;
-    params.q_stride_h = HEAD_DIM;
-    params.window_left = -1; // no sliding window
-    params.logits_soft_cap = 0.0f;
-    params.sm_scale = 1.0f / sqrtf((float)HEAD_DIM);
-    params.rope_rcp_scale = 1.0f;
-    params.rope_rcp_theta = 1.0f;
-
-    // Set plan info pointers
-    params.request_indices =
-        GetPtrFromBaseOffset<IdType>(int_workspace, plan_info.request_indices_offset);
-    params.kv_tile_indices =
-        GetPtrFromBaseOffset<IdType>(int_workspace, plan_info.kv_tile_indices_offset);
-    params.o_indptr =
-        GetPtrFromBaseOffset<IdType>(int_workspace, plan_info.o_indptr_offset);
-    params.kv_chunk_size_ptr =
-        GetPtrFromBaseOffset<IdType>(int_workspace, plan_info.kv_chunk_size_ptr_offset);
-    params.block_valid_mask = nullptr;
-    params.partition_kv = false;
-
-    DTypeO* tmp_v = nullptr;
-    float* tmp_s = nullptr;
-
-    if (plan_info.split_kv) {
-        tmp_v = GetPtrFromBaseOffset<DTypeO>(float_workspace, plan_info.v_offset);
-        tmp_s = GetPtrFromBaseOffset<float>(float_workspace, plan_info.s_offset);
-        if (plan_info.enable_cuda_graph) {
-            params.block_valid_mask =
-                GetPtrFromBaseOffset<bool>(int_workspace, plan_info.block_valid_mask_offset);
-        }
-    }
-
-    cudaError_t status =
-        flashinfer::BatchDecodeWithPagedKVCacheDispatched<HEAD_DIM, POS_ENCODING_MODE, Variant>(
-            params, tmp_v, tmp_s, /*enable_pdl=*/false, stream);
-
-    return (int)status;
-}
-
-// ═══════════════════════════════════════════════════════════
-// BatchPrefill (fp16/bf16 only — tensor core MMA requires 16-bit inputs)
-// ═══════════════════════════════════════════════════════════
-//
-// The prefill kernel templates are instantiated above for fp16. These C API
-// functions accept fp32 pointers (matching the current luminal pipeline) but
-// return -1 to indicate that fp32 prefill is not supported. When native fp16
-// support is added, these will accept fp16 pointers and call through to the
-// instantiated templates.
-
-int flashinfer_batch_prefill_plan(
-    void*, size_t, void*, size_t, void*,
-    int32_t*, int32_t*, int, int,
-    int, int, int, int, cudaStream_t,
-    int64_t*, int*)
-{
-    return -1; // fp32 not supported — requires fp16/bf16
-}
-
-int flashinfer_batch_prefill_run(
-    void*, size_t, void*,
-    int64_t*, int,
-    float*, float*, float*,
-    int32_t*, int32_t*, int32_t*, int32_t*,
-    float*, int, int, int, int, int, int, cudaStream_t)
-{
-    return -1; // fp32 not supported — requires fp16/bf16
-}
-
-} // extern "C"
-
-// ── Slot index extraction kernel (outside extern "C" for __global__) ──
-
-__global__ void extract_slot_indices_kernel(
-    const int32_t* flat_idx, int32_t* out, int c, int kv_dim) {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < c) out[i] = flat_idx[i * kv_dim] / kv_dim;
-}
-
-extern "C" void flashinfer_extract_slot_indices(
-    const int32_t* flat_idx, int32_t* out, int c, int kv_dim,
-    cudaStream_t stream) {
-    if (c == 0) return;
-    int threads = 256;
-    int blocks = (c + threads - 1) / threads;
-    extract_slot_indices_kernel<<<blocks, threads, 0, stream>>>(
-        flat_idx, out, c, kv_dim);
-}
-
-// ── Derive CSR indptr from attention mask ──
-// Mask is (s, c) f32. Entries > -1e9 are "valid" (0.0), rest are -inf.
-// Per-row count of valid entries = context length for that sequence.
-// Output: indptr[0..=s] with indptr[0]=0 and indptr[i+1] = indptr[i] + ctx_len[i].
-// Single thread is fine since s is tiny (batch_size during decode, typically 1-8).
-
-__global__ void derive_indptr_kernel(
-    const float* mask, int32_t* indptr, int s, int c) {
-    if (threadIdx.x != 0 || blockIdx.x != 0) return;
-    indptr[0] = 0;
-    for (int i = 0; i < s; i++) {
-        int count = 0;
-        for (int j = 0; j < c; j++) {
-            if (mask[i * c + j] > -1e9f) count++;
-        }
-        indptr[i + 1] = indptr[i] + count;
-    }
-}
-
-extern "C" void flashinfer_derive_indptr_from_mask(
-    const float* mask, int32_t* indptr, int s, int c,
-    cudaStream_t stream) {
-    if (s == 0) return;
-    derive_indptr_kernel<<<1, 1, 0, stream>>>(mask, indptr, s, c);
-}
-
-// ── Output transpose: (batch, heads, dim) → (heads, batch, dim) ──
-// FlashInfer writes output as (batch, heads, dim) but Luminal expects (heads, batch, dim).
-// For batch=1 these are identical; for batch>1 we need an explicit transpose.
-
-__global__ void transpose_bhd_to_hbd_kernel(
-    const float* src, float* dst, int batch, int heads, int dim) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    int total = batch * heads * dim;
-    if (idx >= total) return;
-
-    // Decompose linear index into (b, h, d) for src layout
-    int d = idx % dim;
-    int h = (idx / dim) % heads;
-    int b = idx / (heads * dim);
-
-    // Write to (h, b, d) layout in dst
-    dst[h * batch * dim + b * dim + d] = src[idx];
-}
-
-extern "C" void flashinfer_transpose_output(
-    const float* src, float* dst,
-    int batch, int heads, int dim,
-    cudaStream_t stream) {
-    int total = batch * heads * dim;
-    if (total == 0) return;
-    int threads = 256;
-    int blocks = (total + threads - 1) / threads;
-    transpose_bhd_to_hbd_kernel<<<blocks, threads, 0, stream>>>(
-        src, dst, batch, heads, dim);
-}
--- a/crates/luminal_cuda_lite/src/host/flashinfer/wrapper.h
+++ b/crates/luminal_cuda_lite/src/host/flashinfer/wrapper.h
@@ -1,93 +0,0 @@
-#pragma once
-
-#include <cuda_runtime.h>
-#include <stdint.h>
-#include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Plan phase: CPU-side scheduling. Must call before each new batch config.
-// Returns 0 on success, non-zero on failure.
-int flashinfer_batch_decode_plan(
-    void* float_workspace, size_t float_ws_size,
-    void* int_workspace, size_t int_ws_size,
-    void* page_locked_int_workspace,
-    int32_t* indptr_h, int batch_size,
-    int num_qo_heads, int num_kv_heads, int page_size, int head_dim,
-    cudaStream_t stream,
-    int64_t* plan_info_out, int* plan_info_len_out);
-
-// Run phase: GPU kernel launch.
-// Returns 0 on success, non-zero on failure.
-int flashinfer_batch_decode_run(
-    void* float_workspace, size_t float_ws_size,
-    void* int_workspace,
-    int64_t* plan_info_vec, int plan_info_len,
-    float* q,                    // [batch_size, num_qo_heads, head_dim]
-    float* k_cache,              // [num_pages, page_size, num_kv_heads, head_dim] (NHD)
-    float* v_cache,              // same layout
-    int32_t* kv_indptr,          // [batch_size + 1]
-    int32_t* kv_indices,         // [total_pages]
-    int32_t* kv_last_page_len,   // [batch_size]
-    float* output,               // [batch_size, num_qo_heads, head_dim]
-    int batch_size,
-    int num_qo_heads, int num_kv_heads, int page_size, int head_dim,
-    cudaStream_t stream);
-
-// Extract slot indices from a flat gather index tensor.
-// flat_idx shape: (c, kv_dim) i32, out shape: (c,) i32.
-// out[i] = flat_idx[i * kv_dim] / kv_dim
-void flashinfer_extract_slot_indices(
-    const int32_t* flat_idx, int32_t* out, int c, int kv_dim,
-    cudaStream_t stream);
-
-// Derive CSR indptr from attention mask.
-// mask shape: (s, c) f32. Entries > -1e9 are valid.
-// indptr shape: (s + 1,) i32. indptr[0] = 0, indptr[i+1] = cumsum of valid counts.
-void flashinfer_derive_indptr_from_mask(
-    const float* mask, int32_t* indptr, int s, int c,
-    cudaStream_t stream);
-
-// Transpose output from (batch, heads, dim) to (heads, batch, dim).
-void flashinfer_transpose_output(
-    const float* src, float* dst,
-    int batch, int heads, int dim,
-    cudaStream_t stream);
-
-// ── BatchPrefill with Paged KV Cache ──
-
-// Plan phase for batch prefill.
-// Returns 0 on success, non-zero on failure.
-int flashinfer_batch_prefill_plan(
-    void* float_workspace, size_t float_ws_size,
-    void* int_workspace, size_t int_ws_size,
-    void* page_locked_int_workspace,
-    int32_t* qo_indptr_h, int32_t* kv_indptr_h,
-    int total_num_rows, int batch_size,
-    int num_qo_heads, int num_kv_heads, int page_size, int head_dim,
-    cudaStream_t stream,
-    int64_t* plan_info_out, int* plan_info_len_out);
-
-// Run phase for batch prefill.
-// Returns 0 on success, non-zero on failure.
-int flashinfer_batch_prefill_run(
-    void* float_workspace, size_t float_ws_size,
-    void* int_workspace,
-    int64_t* plan_info_vec, int plan_info_len,
-    float* q,                    // [total_num_rows, num_qo_heads, head_dim]
-    float* k_cache,              // [num_pages, page_size, num_kv_heads, head_dim] (NHD)
-    float* v_cache,              // same layout
-    int32_t* qo_indptr,         // [batch_size + 1] on GPU
-    int32_t* kv_indptr,         // [batch_size + 1] on GPU
-    int32_t* kv_indices,         // [total_pages]
-    int32_t* kv_last_page_len,   // [batch_size]
-    float* output,               // [total_num_rows, num_qo_heads, head_dim]
-    int total_num_rows, int batch_size,
-    int num_qo_heads, int num_kv_heads, int page_size, int head_dim,
-    cudaStream_t stream);
-
-#ifdef __cplusplus
-}
-#endif
--- a/crates/luminal_cuda_lite/src/host/mod.rs
+++ b/crates/luminal_cuda_lite/src/host/mod.rs
@@ -1,129 +1,17 @@
 use std::{fmt::Debug, sync::Arc};

-use crate::cudarc::driver::{CudaStream, DriverError, result};
+use crate::cudarc::driver::{CudaSlice, CudaStream};
 use luminal::{op::EgglogOp, prelude::*};
 mod cublas;
 mod cublaslt;
-pub mod flashinfer;
 pub mod moe;

 pub type Ops = (
    // cublas::CuBlasSgemmV2,
    cublaslt::CuBlasLt,
-    cublaslt::CuBlasLtScaled,
    moe::GLUMoE,
-    flashinfer::FlashInferAttention,
 );

-#[cfg(test)]
-pub(crate) type CublasLtTypeTuple = (
-    luminal::dtype::DType,
-    luminal::dtype::DType,
-    luminal::dtype::DType,
-    luminal::dtype::DType,
-    &'static str,
-    luminal::dtype::DType,
-);
-
-#[cfg(test)]
-pub(crate) fn cublaslt_type_tuple(op: &dyn HostOp) -> Option<CublasLtTypeTuple> {
-    op.as_any()
-        .downcast_ref::<cublaslt::CuBlasLt>()
-        .map(cublaslt::CuBlasLt::type_tuple)
-}
-
-#[cfg(test)]
-pub(crate) type CublasLtScaleValues = (f64, f64);
-
-#[cfg(test)]
-pub(crate) fn cublaslt_scale_values(op: &dyn HostOp) -> Option<CublasLtScaleValues> {
-    op.as_any()
-        .downcast_ref::<cublaslt::CuBlasLt>()
-        .map(cublaslt::CuBlasLt::scale_values)
-}
-
-#[cfg(test)]
-pub(crate) fn cublaslt_epilogue(op: &dyn HostOp) -> Option<&'static str> {
-    op.as_any()
-        .downcast_ref::<cublaslt::CuBlasLt>()
-        .map(cublaslt::CuBlasLt::epilogue)
-}
-
-#[cfg(test)]
-pub(crate) type CublasLtMatrixOrders = (&'static str, &'static str, &'static str, &'static str);
-
-#[cfg(test)]
-pub(crate) fn cublaslt_matrix_orders(op: &dyn HostOp) -> Option<CublasLtMatrixOrders> {
-    op.as_any()
-        .downcast_ref::<cublaslt::CuBlasLt>()
-        .map(cublaslt::CuBlasLt::matrix_orders)
-}
-
-#[cfg(test)]
-pub(crate) type CublasLtTransposeOps = (&'static str, &'static str);
-
-#[cfg(test)]
-pub(crate) fn cublaslt_transpose_ops(op: &dyn HostOp) -> Option<CublasLtTransposeOps> {
-    op.as_any()
-        .downcast_ref::<cublaslt::CuBlasLt>()
-        .map(cublaslt::CuBlasLt::transpose_ops)
-}
-
-#[cfg(test)]
-pub(crate) fn cublaslt_c_d_layouts_match(op: &dyn HostOp) -> Option<bool> {
-    op.as_any()
-        .downcast_ref::<cublaslt::CuBlasLt>()
-        .map(cublaslt::CuBlasLt::c_d_layouts_match)
-}
-
-#[cfg(test)]
-pub(crate) type CublasLtTensorScaleInputs = (bool, bool);
-
-#[cfg(test)]
-pub(crate) fn cublaslt_tensor_scale_inputs(op: &dyn HostOp) -> Option<CublasLtTensorScaleInputs> {
-    op.as_any()
-        .downcast_ref::<cublaslt::CuBlasLt>()
-        .map(cublaslt::CuBlasLt::tensor_scale_inputs)
-}
-
-/// Non-owning device buffer handle used by host operations.
-///
-/// Runtime-owned intermediates may be a whole `CudaSlice`, a subregion inside
-/// the reusable arena, or an external pointer. Host ops only need the pointer
-/// and the logical byte length.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub struct DeviceBuffer {
-    ptr: u64,
-    len: usize,
-}
-
-impl DeviceBuffer {
-    pub fn new(ptr: u64, len: usize) -> Self {
-        Self { ptr, len }
-    }
-
-    pub fn ptr(self) -> u64 {
-        self.ptr
-    }
-
-    pub fn len(self) -> usize {
-        self.len
-    }
-
-    pub fn is_empty(self) -> bool {
-        self.len == 0
-    }
-
-    pub fn clone_dtoh(self, stream: &Arc<CudaStream>) -> Result<Vec<u8>, DriverError> {
-        let mut host = vec![0u8; self.len];
-        unsafe {
-            result::memcpy_dtoh_async(&mut host, self.ptr, stream.cu_stream())?;
-        }
-        stream.synchronize()?;
-        Ok(host)
-    }
-}
-
 /// Host operations that execute on the CPU but orchestrate GPU work.
 ///
 /// This includes operations like cuBLAS calls and CUDA graph executions.
@@ -141,7 +29,7 @@ pub trait HostOp: Debug + as_any::AsAny + EgglogOp {
        stream: &Arc<CudaStream>,
        self_node: NodeIndex,
        inputs: &[NodeIndex],
-        buffers: &FxHashMap<NodeIndex, DeviceBuffer>,
+        buffers: &FxHashMap<NodeIndex, &CudaSlice<u8>>,
        dyn_map: &FxHashMap<char, usize>,
    ) -> anyhow::Result<()>;

@@ -160,15 +48,6 @@ pub trait HostOp: Debug + as_any::AsAny + EgglogOp {
        vec![]
    }

-    /// Returns relative lifetimes for extra buffer nodes within this host op.
-    ///
-    /// The tuple is `(node, first_step, last_step)`, where steps are local to
-    /// this host op's execution. Returning `None` tells the runtime to treat
-    /// every extra buffer as live for the whole host op.
-    fn extra_buffer_lifetimes(&self) -> Option<Vec<(NodeIndex, usize, usize)>> {
-        None
-    }
-
    /// Returns buffer size requirements for extra nodes (node -> size in elements).
    ///
    /// Called during buffer allocation to ensure all required buffers exist.
--- a/crates/luminal_cuda_lite/src/host/moe/glumoe_rewrite.egg
+++ b/crates/luminal_cuda_lite/src/host/moe/glumoe_rewrite.egg
@@ -5,19 +5,12 @@
 ;   mode=1: Gemma-style GELU (gate * sigmoid(1.595769 * gate * (1 + 0.044715 * gate^2)))
 ;
 ; To keep matching fast, we stage through marker states:
-;   1) Shared expert index/gather markers
-;   2) Shared gate-up matmul marker
-;   3) Activation marker (separate swiglu / gemma_gelu paths)
-;   4) Down matmul marker (separate swiglu / gemma_gelu paths)
-;   5) Final GLUMoE fusion (separate swiglu / gemma_gelu rules)
+;   1) Shared gate-up matmul marker
+;   2) Activation marker (separate swiglu / gemma_gelu paths)
+;   3) Down matmul marker (separate swiglu / gemma_gelu paths)
+;   4) Final GLUMoE fusion (separate swiglu / gemma_gelu rules)

 (datatype*
-    (GLUMoEExpertIndexState
-        (MkGLUMoEExpertIndexState Expression Expression IR)
-    )
-    (GLUMoEExpertGatherState
-        (MkGLUMoEExpertGatherState Expression Expression IR IR)
-    )
    (GLUMoEGateUpState
        (MkGLUMoEGateUpState Expression Expression Expression IR IR IR)
    )
@@ -35,8 +28,6 @@
    )
 )

-(function glumoe_expert_index (IR) GLUMoEExpertIndexState :merge new)
-(function glumoe_expert_gather (IR) GLUMoEExpertGatherState :merge new)
 (function glumoe_gate_up (IR) GLUMoEGateUpState :merge new)
 (function glumoe_swiglu (IR) GLUMoESwiGLUState :merge new)
 (function glumoe_gemma_gelu (IR) GLUMoEGemmaGELUState :merge new)
@@ -45,38 +36,17 @@

 (rule
    (
-        (= ?iota_base (Op (Iota ?io ?iota_base_range) (INil)))
-        (= ?mul_base (Op (Mul ?mul_base_shape ?mul_base_a_stride ?mul_base_b_stride ?mul_base_out_stride) (ICons ?topk_idx (ICons ?iota_base (INil)))))
-        (= ?iota_within (Op (Iota (MIter) ?iota_within_range) (INil)))
-        (= ?add_idx (Op (Add ?add_shape ?add_a_stride ?add_b_stride ?add_out_stride) (ICons ?mul_base (ICons ?iota_within (INil)))))
-    )
-    (
-        (set (glumoe_expert_index ?add_idx)
-            (MkGLUMoEExpertIndexState ?io ?iota_within_range ?topk_idx))
-    )
-    :ruleset glumoe
-    :name "GLUMoE expert index marker"
-)
+        ; ===== Gate-up expert gather =====
+        (= ?gu_iota_base (Op (Iota ?gu_io ?gu_iota_base_range) (INil)))
+        (= ?gu_mul_base (Op (Mul ?gu_mul_base_shape ?gu_mul_base_a_stride ?gu_mul_base_b_stride ?gu_mul_base_out_stride) (ICons ?topk_idx (ICons ?gu_iota_base (INil)))))
+        (= ?gu_iota_within (Op (Iota (MIter) ?gu_iota_within_range) (INil)))
+        (= ?gu_add_idx (Op (Add ?gu_add_shape ?gu_add_a_stride ?gu_add_b_stride ?gu_add_out_stride) (ICons ?gu_mul_base (ICons ?gu_iota_within (INil)))))
+        (= ?gu_gathered (Op (Gather ?gu_gather_idx_shape ?gu_gather_idx_stride ?gu_gather_data_shape ?gu_gather_data_stride) (ICons ?gu_add_idx (ICons ?gate_up_w (INil)))))

-(rule
-    (
-        (= ?index_state (glumoe_expert_index ?idx))
-        (= ?index_state (MkGLUMoEExpertIndexState ?io ?within_range ?topk_idx))
-        (= ?gathered (Op (Gather ?gather_idx_shape ?gather_idx_stride ?gather_data_shape ?gather_data_stride) (ICons ?idx (ICons ?weights (INil)))))
-        (= ?f32 (Op (Cast ?f32_size (F32)) (ICons ?gathered (INil))))
-    )
-    (
-        (set (glumoe_expert_gather ?f32)
-            (MkGLUMoEExpertGatherState ?io ?within_range ?topk_idx ?weights))
-    )
-    :ruleset glumoe
-    :name "GLUMoE expert gather marker"
-)
+        ; ===== Cast BF16→F32 =====
+        (= ?gu_f32 (Op (Cast ?gu_f32_size (F32)) (ICons ?gu_gathered (INil))))

-(rule
-    (
-        (= ?gather_state (glumoe_expert_gather ?gu_f32))
-        (= ?gather_state (MkGLUMoEExpertGatherState ?gu_io ?gu_iota_within_range ?topk_idx ?gate_up_w))
+        ; ===== Gate-up batched matmul =====
        (= ?gu_matmul_mul (Op (Mul ?gu_matmul_mul_shape ?gu_matmul_a_stride ?gu_matmul_b_stride ?gu_matmul_mul_out_stride) (ICons ?x (ICons ?gu_f32 (INil)))))
        (= ?gu_matmul (Op (Sum ?gu_matmul_out_shape ?gu_matmul_k ?gu_matmul_in_stride ?gu_matmul_k_stride ?gu_matmul_out_stride) (ICons ?gu_matmul_mul (INil))))
    )
@@ -84,7 +54,6 @@
        (set (glumoe_gate_up ?gu_matmul)
            (MkGLUMoEGateUpState ?gu_io ?gu_matmul_k ?gu_iota_within_range ?x ?topk_idx ?gate_up_w))
    )
-    :ruleset glumoe
    :name "GLUMoE gate-up matmul marker"
 )

@@ -111,7 +80,6 @@
    (
        (set (glumoe_swiglu ?swiglu_out) (MkGLUMoESwiGLUState ?gate_up_state))
    )
-    :ruleset glumoe
    :name "GLUMoE swiglu marker"
 )

@@ -145,7 +113,6 @@
    (
        (set (glumoe_gemma_gelu ?gemma_out) (MkGLUMoEGemmaGELUState ?gate_up_state))
    )
-    :ruleset glumoe
    :name "GLUMoE gemma gelu marker"
 )

@@ -155,8 +122,12 @@
        (= ?swiglu_state (glumoe_swiglu ?swiglu_out))
        (= ?swiglu_state (MkGLUMoESwiGLUState ?gate_up_state))

-        (= ?gather_state (glumoe_expert_gather ?dn_f32))
-        (= ?gather_state (MkGLUMoEExpertGatherState ?dn_io ?dn_iota_within_range ?topk_idx ?down_w))
+        (= ?dn_iota_base (Op (Iota ?dn_io ?dn_iota_base_range) (INil)))
+        (= ?dn_mul_base (Op (Mul ?dn_mul_base_shape ?dn_mul_base_a_stride ?dn_mul_base_b_stride ?dn_mul_base_out_stride) (ICons ?topk_idx (ICons ?dn_iota_base (INil)))))
+        (= ?dn_iota_within (Op (Iota (MIter) ?dn_iota_within_range) (INil)))
+        (= ?dn_add_idx (Op (Add ?dn_add_shape ?dn_add_a_stride ?dn_add_b_stride ?dn_add_out_stride) (ICons ?dn_mul_base (ICons ?dn_iota_within (INil)))))
+        (= ?dn_gathered (Op (Gather ?dn_gather_idx_shape ?dn_gather_idx_stride ?dn_gather_data_shape ?dn_gather_data_stride) (ICons ?dn_add_idx (ICons ?down_w (INil)))))
+        (= ?dn_f32 (Op (Cast ?dn_f32_size (F32)) (ICons ?dn_gathered (INil))))
        (= ?dn_matmul_mul (Op (Mul ?dn_matmul_mul_shape ?dn_matmul_a_stride ?dn_matmul_b_stride ?dn_matmul_mul_out_stride) (ICons ?swiglu_out (ICons ?dn_f32 (INil)))))
        (= ?dn_matmul (Op (Sum ?dn_matmul_out_shape ?dn_matmul_k ?dn_matmul_in_stride ?dn_matmul_k_stride ?dn_matmul_out_stride) (ICons ?dn_matmul_mul (INil))))
    )
@@ -164,7 +135,6 @@
        (set (glumoe_swiglu_down ?dn_matmul)
            (MkGLUMoESwiGLUDownState ?dn_io ?dn_matmul_k ?dn_iota_within_range ?swiglu_state ?topk_idx ?down_w))
    )
-    :ruleset glumoe
    :name "GLUMoE swiglu down marker"
 )

@@ -174,8 +144,12 @@
        (= ?gemma_state (glumoe_gemma_gelu ?gemma_out))
        (= ?gemma_state (MkGLUMoEGemmaGELUState ?gate_up_state))

-        (= ?gather_state (glumoe_expert_gather ?dn_f32))
-        (= ?gather_state (MkGLUMoEExpertGatherState ?dn_io ?dn_iota_within_range ?topk_idx ?down_w))
+        (= ?dn_iota_base (Op (Iota ?dn_io ?dn_iota_base_range) (INil)))
+        (= ?dn_mul_base (Op (Mul ?dn_mul_base_shape ?dn_mul_base_a_stride ?dn_mul_base_b_stride ?dn_mul_base_out_stride) (ICons ?topk_idx (ICons ?dn_iota_base (INil)))))
+        (= ?dn_iota_within (Op (Iota (MIter) ?dn_iota_within_range) (INil)))
+        (= ?dn_add_idx (Op (Add ?dn_add_shape ?dn_add_a_stride ?dn_add_b_stride ?dn_add_out_stride) (ICons ?dn_mul_base (ICons ?dn_iota_within (INil)))))
+        (= ?dn_gathered (Op (Gather ?dn_gather_idx_shape ?dn_gather_idx_stride ?dn_gather_data_shape ?dn_gather_data_stride) (ICons ?dn_add_idx (ICons ?down_w (INil)))))
+        (= ?dn_f32 (Op (Cast ?dn_f32_size (F32)) (ICons ?dn_gathered (INil))))
        (= ?dn_matmul_mul (Op (Mul ?dn_matmul_mul_shape ?dn_matmul_a_stride ?dn_matmul_b_stride ?dn_matmul_mul_out_stride) (ICons ?gemma_out (ICons ?dn_f32 (INil)))))
        (= ?dn_matmul (Op (Sum ?dn_matmul_out_shape ?dn_matmul_k ?dn_matmul_in_stride ?dn_matmul_k_stride ?dn_matmul_out_stride) (ICons ?dn_matmul_mul (INil))))
    )
@@ -183,7 +157,6 @@
        (set (glumoe_gemma_down ?dn_matmul)
            (MkGLUMoEGemmaDownState ?dn_io ?dn_matmul_k ?dn_iota_within_range ?gemma_state ?topk_idx ?down_w))
    )
-    :ruleset glumoe
    :name "GLUMoE gemma down marker"
 )

@@ -195,10 +168,6 @@
        (= ?swiglu_state (MkGLUMoESwiGLUState ?gate_up_state))
        (= ?gate_up_state (MkGLUMoEGateUpState ?gu_io ?gu_matmul_k ?gu_within_range ?x ?topk_idx ?gate_up_w))

-        (= ?topk_row_offsets (Op (Iota ?topk_row_offsets_expr ?topk_row_offsets_range) (INil)))
-        (= ?topk_flat_idx (Op (Add ?topk_flat_idx_shape ?topk_flat_idx_a_stride ?topk_flat_idx_b_stride ?topk_flat_idx_out_stride) (ICons ?topk_row_offsets (ICons ?topk_idx (INil)))))
-        (= ?topk_vals (Op (Gather ?topk_vals_gather_idx_shape ?topk_vals_gather_idx_stride ?topk_vals_gather_data_shape ?topk_vals_gather_data_stride) (ICons ?topk_flat_idx (ICons ?routing_weights (INil)))))
-
        (= ?weighted (Op (Mul ?weighted_shape ?weighted_a_stride ?weighted_b_stride ?weighted_out_stride) (ICons ?dn_matmul (ICons ?topk_vals (INil)))))
        (= ?output (Op (Sum ?output_shape ?output_k ?output_in_stride ?output_k_stride ?output_out_stride) (ICons ?weighted (INil))))
    )
@@ -208,44 +177,10 @@
            ?gu_within_range ?dn_within_range (MNum 0))
            (ICons ?x (ICons ?topk_idx (ICons ?topk_vals (ICons ?gate_up_w (ICons ?down_w (ICons ?topk_vals (INil)))))))))
        (union ?output ?glumoe)
-        (subsume (Op (Sum ?output_shape ?output_k ?output_in_stride ?output_k_stride ?output_out_stride) (ICons ?weighted (INil))))
-        (subsume (Op (KernelSum ?output_shape ?output_k ?output_in_stride ?output_k_stride ?output_out_stride (F32)) (ICons ?weighted (INil))))
    )
-    :ruleset glumoe
    :name "GLUMoE fused expert computation (swiglu)"
 )

-; ===== Final fusion: mode 2 (SwiGLU with row-normalized top-k weights) =====
-(rule
-    (
-        (= ?down_state (glumoe_swiglu_down ?dn_matmul))
-        (= ?down_state (MkGLUMoESwiGLUDownState ?dn_io ?dn_matmul_k ?dn_within_range ?swiglu_state ?topk_idx ?down_w))
-        (= ?swiglu_state (MkGLUMoESwiGLUState ?gate_up_state))
-        (= ?gate_up_state (MkGLUMoEGateUpState ?gu_io ?gu_matmul_k ?gu_within_range ?x ?topk_idx ?gate_up_w))
-
-        (= ?topk_row_offsets (Op (Iota ?topk_row_offsets_expr ?topk_row_offsets_range) (INil)))
-        (= ?topk_flat_idx (Op (Add ?topk_flat_idx_shape ?topk_flat_idx_a_stride ?topk_flat_idx_b_stride ?topk_flat_idx_out_stride) (ICons ?topk_row_offsets (ICons ?topk_idx (INil)))))
-        (= ?topk_vals (Op (Gather ?topk_vals_gather_idx_shape ?topk_vals_gather_idx_stride ?topk_vals_gather_data_shape ?topk_vals_gather_data_stride) (ICons ?topk_flat_idx (ICons ?routing_weights (INil)))))
-        (= ?topk_norm (Op (Sum ?topk_norm_shape ?output_k ?topk_norm_in_stride ?topk_norm_k_stride ?topk_norm_out_stride) (ICons ?topk_vals (INil))))
-        (= ?topk_norm_factor (Op (Recip ?topk_norm_recip_shape ?topk_norm_recip_in_stride ?topk_norm_recip_out_stride) (ICons ?topk_norm (INil))))
-        (= ?normed_topk (Op (Mul ?normed_topk_shape ?normed_topk_a_stride ?normed_topk_b_stride ?normed_topk_out_stride) (ICons ?topk_vals (ICons ?topk_norm_factor (INil)))))
-
-        (= ?weighted (Op (Mul ?weighted_shape ?weighted_a_stride ?weighted_b_stride ?weighted_out_stride) (ICons ?dn_matmul (ICons ?normed_topk (INil)))))
-        (= ?output (Op (Sum ?output_shape ?output_k ?output_in_stride ?output_k_stride ?output_out_stride) (ICons ?weighted (INil))))
-    )
-    (
-        (let ?glumoe (Op (GLUMoE
-            ?gu_io ?dn_io ?gu_matmul_k ?dn_matmul_k ?output_k
-            ?gu_within_range ?dn_within_range (MNum 2))
-            (ICons ?x (ICons ?topk_idx (ICons ?topk_vals (ICons ?gate_up_w (ICons ?down_w (ICons ?topk_vals (INil)))))))))
-        (union ?output ?glumoe)
-        (subsume (Op (Sum ?output_shape ?output_k ?output_in_stride ?output_k_stride ?output_out_stride) (ICons ?weighted (INil))))
-        (subsume (Op (KernelSum ?output_shape ?output_k ?output_in_stride ?output_k_stride ?output_out_stride (F32)) (ICons ?weighted (INil))))
-    )
-    :ruleset glumoe
-    :name "GLUMoE fused expert computation (normalized swiglu)"
-)
-
 ; ===== Final fusion: mode 1 (Gemma GELU) =====
 (rule
    (
@@ -273,9 +208,6 @@
            ?gu_within_range ?dn_within_range (MNum 1))
            (ICons ?x (ICons ?topk_idx (ICons ?topk_vals (ICons ?gate_up_w (ICons ?down_w (ICons ?per_expert_scale (INil)))))))))
        (union ?output ?glumoe)
-        (subsume (Op (Sum ?output_shape ?output_k ?output_in_stride ?output_k_stride ?output_out_stride) (ICons ?weighted (INil))))
-        (subsume (Op (KernelSum ?output_shape ?output_k ?output_in_stride ?output_k_stride ?output_out_stride (F32)) (ICons ?weighted (INil))))
    )
-    :ruleset glumoe
    :name "GLUMoE fused expert computation (gemma_gelu)"
 )
--- a/crates/luminal_cuda_lite/src/host/moe/mod.rs
+++ b/crates/luminal_cuda_lite/src/host/moe/mod.rs
@@ -32,7 +32,7 @@ use crate::{
            CudaFunction, CudaModule, CudaSlice, CudaStream, DevicePtr, LaunchConfig, PushKernelArg,
        },
    },
-    host::{DeviceBuffer, HostOp},
+    host::HostOp,
    try_create_cublaslt,
 };

@@ -50,7 +50,7 @@ const WORKSPACE_SIZE: usize = 32 * 1024 * 1024; // 32 MiB
 ///   3: gate_up_w      [E, gate_up_dim, hidden]             BF16
 ///   4: down_w         [E, hidden, intermediate]             BF16
 ///   5: mode_aux
-///      - SwiGLU/SwiGLUNormalized: ignored (rewriter wires `topk_values` again)
+///      - SwiGLU: ignored (rewriter wires `topk_values` again)
 ///      - GemmaGELU: per_expert_scale [E]                   F32
 ///
 /// Output: [seq, hidden] F32
@@ -78,7 +78,6 @@ pub struct GLUMoE {
 pub(crate) enum GLUMoEMode {
    SwiGLU,
    GemmaGELU,
-    SwiGLUNormalized,
 }

 impl GLUMoEMode {
@@ -86,7 +85,6 @@ impl GLUMoEMode {
        match mode_id {
            0 => Self::SwiGLU,
            1 => Self::GemmaGELU,
-            2 => Self::SwiGLUNormalized,
            other => {
                panic!("Unknown GLUMoE mode id: {other}");
            }
@@ -95,7 +93,7 @@ impl GLUMoEMode {

    fn activation_kernel_mode(self) -> i32 {
        match self {
-            Self::SwiGLU | Self::SwiGLUNormalized => 0,
+            Self::SwiGLU => 0,
            Self::GemmaGELU => 1,
        }
    }
@@ -226,9 +224,8 @@ impl EgglogOp for GLUMoE {
    }

    fn rewrites(&self) -> Vec<Rule> {
-        vec![
-            Rule::raw(
-                "(rule
+        vec![Rule::raw(
+            "(rule
                (
                    (= ?e (Op (GLUMoE ?gu_io ?dn_io ?gu_matmul_k ?dn_matmul_k ?output_k ?gu_within_range ?dn_within_range ?mode) ?inputs))
                )
@@ -237,15 +234,17 @@ impl EgglogOp for GLUMoE {
                )
                :ruleset dtype_prop
            )",
-            ),
-            Rule::raw(include_str!["glumoe_rewrite.egg"]),
-        ]
+        )]
    }

    fn n_inputs(&self) -> usize {
        6
    }

+    fn early_rewrites(&self) -> Vec<Rule> {
+        vec![Rule::raw(include_str!["glumoe_rewrite.egg"])]
+    }
+
    fn extract<'a>(
        &'a self,
        egraph: &'a luminal::egglog_utils::SerializedEGraph,
@@ -296,140 +295,27 @@ impl HostOp for GLUMoE {
        stream: &Arc<CudaStream>,
        self_node: NodeIndex,
        inputs: &[NodeIndex],
-        buffers: &FxHashMap<NodeIndex, DeviceBuffer>,
+        buffers: &FxHashMap<NodeIndex, &CudaSlice<u8>>,
        dyn_map: &FxHashMap<char, usize>,
    ) -> anyhow::Result<()> {
-        if inputs.len() < 6 {
-            anyhow::bail!("GLUMoE expected at least 6 inputs, got {}", inputs.len());
-        }
-
        // Resolve dimensions
-        let hidden = self
-            .gu_matmul_k
-            .exec(dyn_map)
-            .ok_or_else(|| anyhow::anyhow!("GLUMoE hidden dimension is unresolved"))?;
-        let intermediate = self
-            .dn_matmul_k
-            .exec(dyn_map)
-            .ok_or_else(|| anyhow::anyhow!("GLUMoE intermediate dimension is unresolved"))?;
-        let top_k = self
-            .output_k
-            .exec(dyn_map)
-            .ok_or_else(|| anyhow::anyhow!("GLUMoE top-k dimension is unresolved"))?;
-        let gu_io = self
-            .gu_io
-            .exec(dyn_map)
-            .ok_or_else(|| anyhow::anyhow!("GLUMoE gate/up stride is unresolved"))?;
-        let dn_io = self
-            .dn_io
-            .exec(dyn_map)
-            .ok_or_else(|| anyhow::anyhow!("GLUMoE down stride is unresolved"))?;
+        let hidden = self.gu_matmul_k.exec(dyn_map).unwrap();
+        let intermediate = self.dn_matmul_k.exec(dyn_map).unwrap();
+        let top_k_expected = self.output_k.exec(dyn_map).unwrap();
+        let gate_up_dim = self.gu_io.exec(dyn_map).unwrap() / hidden; // gate_up_dim = gu_io / hidden
+        let num_experts = self.gu_within_range.exec(dyn_map).unwrap() / (gate_up_dim * hidden);

-        if hidden == 0 || intermediate == 0 {
-            anyhow::bail!(
-                "GLUMoE got zero-sized matmul dimensions: hidden={hidden}, intermediate={intermediate}"
-            );
-        }
-        if top_k == 0 {
-            return Ok(());
-        }
-        if gu_io % hidden != 0 {
-            anyhow::bail!("GLUMoE gate/up stride {gu_io} is not divisible by hidden {hidden}");
-        }
-        if dn_io % intermediate != 0 {
-            anyhow::bail!(
-                "GLUMoE down stride {dn_io} is not divisible by intermediate {intermediate}"
-            );
-        }
-
-        let gate_up_dim = gu_io / hidden; // gate_up_dim = 2 * intermediate for GLU
-        let down_hidden = dn_io / intermediate;
-        if gate_up_dim != intermediate * 2 {
-            anyhow::bail!(
-                "GLUMoE expected gate/up dim {} to equal 2 * intermediate {}",
-                gate_up_dim,
-                intermediate * 2
-            );
-        }
-        if down_hidden != hidden {
-            anyhow::bail!("GLUMoE down hidden {down_hidden} does not match hidden {hidden}");
-        }
-
-        let output_bytes = self
-            .output_bytes()
-            .exec(dyn_map)
-            .ok_or_else(|| anyhow::anyhow!("GLUMoE output byte size is unresolved"))?;
-        if output_bytes % (hidden * 4) != 0 {
-            anyhow::bail!(
-                "GLUMoE output bytes {output_bytes} are not divisible by hidden bytes {}",
-                hidden * 4
-            );
-        }
-        let seq = output_bytes / (hidden * 4);
-        if seq == 0 {
-            return Ok(());
-        }
-
-        let get_buffer = |name: &str, node: NodeIndex| -> anyhow::Result<DeviceBuffer> {
-            buffers.get(&node).copied().ok_or_else(|| {
-                anyhow::anyhow!("GLUMoE missing {name} buffer for LLIR node {node:?}")
-            })
-        };
+        // Derive seq from x buffer size: x is [seq, hidden] F32 → seq = len / (hidden * 4)
+        let x_buf = buffers[&inputs[0]];
+        let seq = x_buf.len() / (hidden * 4);

        // Get input/output buffers
-        let x_buf = get_buffer("x", inputs[0])?; // [seq, hidden] F32
-        let topk_idx_buf = get_buffer("topk indices", inputs[1])?; // [seq, k] Int
-        let topk_vals_buf = get_buffer("topk values", inputs[2])?; // [seq, k] F32
-        let gate_up_buf = get_buffer("gate/up weights", inputs[3])?; // [E, gate_up_dim, hidden] BF16
-        let down_buf = get_buffer("down weights", inputs[4])?; // [E, hidden, intermediate] BF16
-        let mode_aux_buf = get_buffer("mode aux", inputs[5])?;
-        let output_buf = get_buffer("output", self_node)?; // [seq, hidden] F32
-
-        let min_topk_bytes = seq * top_k * 4;
-        if x_buf.len() < output_bytes {
-            anyhow::bail!(
-                "GLUMoE x buffer too small: have {} bytes, need {output_bytes}",
-                x_buf.len()
-            );
-        }
-        if topk_idx_buf.len() < min_topk_bytes {
-            anyhow::bail!(
-                "GLUMoE topk index buffer too small: have {} bytes, need {min_topk_bytes}",
-                topk_idx_buf.len()
-            );
-        }
-        if topk_vals_buf.len() < min_topk_bytes {
-            anyhow::bail!(
-                "GLUMoE topk value buffer too small: have {} bytes, need {min_topk_bytes}",
-                topk_vals_buf.len()
-            );
-        }
-        if output_buf.len() < output_bytes {
-            anyhow::bail!(
-                "GLUMoE output buffer too small: have {} bytes, need {output_bytes}",
-                output_buf.len()
-            );
-        }
-
-        let gu_stride_bytes = gate_up_dim * hidden * 2;
-        let down_stride_bytes = hidden * intermediate * 2;
-        if gu_stride_bytes == 0 || gate_up_buf.len() % gu_stride_bytes != 0 {
-            anyhow::bail!(
-                "GLUMoE gate/up weight buffer has {} bytes, not a multiple of per-expert stride {gu_stride_bytes}",
-                gate_up_buf.len()
-            );
-        }
-        let num_experts = gate_up_buf.len() / gu_stride_bytes;
-        if num_experts == 0 {
-            anyhow::bail!("GLUMoE has no expert weights");
-        }
-        if down_buf.len() < num_experts * down_stride_bytes {
-            anyhow::bail!(
-                "GLUMoE down weight buffer too small: have {} bytes, need {}",
-                down_buf.len(),
-                num_experts * down_stride_bytes
-            );
-        }
+        let topk_idx_buf = buffers[&inputs[1]]; // [seq, k] Int
+        let topk_vals_buf = buffers[&inputs[2]]; // [seq, k] F32
+        let gate_up_buf = buffers[&inputs[3]]; // [E, gate_up_dim, hidden] BF16
+        let down_buf = buffers[&inputs[4]]; // [E, hidden, intermediate] BF16
+        let mode_aux_buf = buffers[&inputs[5]];
+        let output_buf = buffers[&self_node]; // [seq, hidden] F32

        // Get raw device pointer addresses
        let x_ptr = buf_ptr(x_buf, stream);
@@ -441,101 +327,41 @@ impl HostOp for GLUMoE {
        let (_, f32_to_bf16_fn, activation_fn) = self.get_kernels(stream);

        // Read top-k routing values from GPU
-        let topk_idx_host: Vec<u8> = topk_idx_buf.clone_dtoh(stream)?;
+        let topk_idx_host: Vec<u8> = stream.clone_dtoh(topk_idx_buf)?;
        let topk_idx_i32: &[i32] = bytemuck::cast_slice(&topk_idx_host);
-        let topk_vals_host: Vec<u8> = topk_vals_buf.clone_dtoh(stream)?;
+        let topk_vals_host: Vec<u8> = stream.clone_dtoh(topk_vals_buf)?;
        let topk_vals_f32: &[f32] = bytemuck::cast_slice(&topk_vals_host);
-
-        if !topk_idx_i32.len().is_multiple_of(seq) {
-            anyhow::bail!(
-                "GLUMoE topk index element count {} is not divisible by seq {seq}",
-                topk_idx_i32.len()
-            );
-        }
-        if !topk_vals_f32.len().is_multiple_of(seq) {
-            anyhow::bail!(
-                "GLUMoE topk value element count {} is not divisible by seq {seq}",
-                topk_vals_f32.len()
-            );
-        }
-        let topk_idx_row_stride = topk_idx_i32.len() / seq;
-        let topk_vals_row_stride = topk_vals_f32.len() / seq;
-        if topk_idx_row_stride < top_k {
-            anyhow::bail!(
-                "GLUMoE topk index row stride {topk_idx_row_stride} is smaller than top_k {top_k}"
-            );
-        }
-        if topk_vals_row_stride < top_k {
-            anyhow::bail!(
-                "GLUMoE topk value row stride {topk_vals_row_stride} is smaller than top_k {top_k}"
-            );
-        }
-
-        let topk_idx_at = |token: usize, expert: usize| -> i32 {
-            topk_idx_i32[token * topk_idx_row_stride + expert]
-        };
-        let topk_val_at = |token: usize, expert: usize| -> f32 {
-            topk_vals_f32[token * topk_vals_row_stride + expert]
-        };
-
-        for t in 0..seq {
-            for i in 0..top_k {
-                let expert_idx = topk_idx_at(t, i);
-                if expert_idx < 0 || expert_idx as usize >= num_experts {
-                    anyhow::bail!(
-                        "GLUMoE expert index {expert_idx} at token {t} top-k position {i} out of bounds for {num_experts} experts"
-                    );
-                }
-            }
+        let idx_k = topk_idx_i32
+            .len()
+            .checked_div(seq)
+            .unwrap_or(top_k_expected);
+        let val_k = topk_vals_f32
+            .len()
+            .checked_div(seq)
+            .unwrap_or(top_k_expected);
+        let top_k = idx_k.min(val_k);
+        if seq > 0 && top_k == 0 {
+            return Ok(());
        }

        // Mode-dependent expert weights used for the final reduction:
        // - SwiGLU: direct topk values
-        // - SwiGLUNormalized: normalize topk values row-wise
        // - GemmaGELU: normalize topk values and scale by per-expert factors
        let mut expert_weights_storage: Vec<f32> = Vec::new();
        let expert_weights_f32: &[f32] = match self.mode {
-            GLUMoEMode::SwiGLU => {
-                if topk_vals_row_stride == top_k {
-                    topk_vals_f32
-                } else {
-                    expert_weights_storage.resize(seq * top_k, 0.0);
-                    for t in 0..seq {
-                        for i in 0..top_k {
-                            expert_weights_storage[t * top_k + i] = topk_val_at(t, i);
-                        }
-                    }
-                    &expert_weights_storage
-                }
-            }
-            GLUMoEMode::SwiGLUNormalized => {
-                expert_weights_storage.resize(seq * top_k, 0.0);
-                for t in 0..seq {
-                    let norm = (0..top_k).map(|i| topk_val_at(t, i)).sum::<f32>();
-                    let inv_norm = if norm != 0.0 { norm.recip() } else { 0.0 };
-                    for i in 0..top_k {
-                        expert_weights_storage[t * top_k + i] = topk_val_at(t, i) * inv_norm;
-                    }
-                }
-                &expert_weights_storage
-            }
+            GLUMoEMode::SwiGLU => topk_vals_f32,
            GLUMoEMode::GemmaGELU => {
-                let per_expert_scale_host: Vec<u8> = mode_aux_buf.clone_dtoh(stream)?;
-                let per_expert_scale_bytes = num_experts * 4;
-                if per_expert_scale_host.len() < per_expert_scale_bytes {
-                    anyhow::bail!(
-                        "GLUMoE per-expert scale buffer too small: have {} bytes, need {per_expert_scale_bytes}",
-                        per_expert_scale_host.len()
-                    );
-                }
-                let per_expert_scale_f32: &[f32] =
-                    bytemuck::cast_slice(&per_expert_scale_host[..per_expert_scale_bytes]);
+                let per_expert_scale_host: Vec<u8> = stream.clone_dtoh(mode_aux_buf)?;
+                let per_expert_scale_f32: &[f32] = bytemuck::cast_slice(&per_expert_scale_host);
+                debug_assert!(per_expert_scale_f32.len() >= num_experts);
                expert_weights_storage.resize(seq * top_k, 0.0);
                for t in 0..seq {
-                    let norm = (0..top_k).map(|i| topk_val_at(t, i)).sum::<f32>();
+                    let base = t * top_k;
+                    let vals = &topk_vals_f32[base..base + top_k];
+                    let norm = vals.iter().copied().sum::<f32>();
                    let inv_norm = if norm != 0.0 { norm.recip() } else { 0.0 };
                    for i in 0..top_k {
-                        let expert_idx = topk_idx_at(t, i) as usize;
+                        let expert_idx = topk_idx_i32[base + i] as usize;
                        if expert_idx >= per_expert_scale_f32.len() {
                            anyhow::bail!(
                                "GLUMoE Gemma mode expert index {} out of bounds {}",
@@ -544,8 +370,7 @@ impl HostOp for GLUMoE {
                            );
                        }
                        let scale = per_expert_scale_f32[expert_idx];
-                        expert_weights_storage[t * top_k + i] =
-                            topk_val_at(t, i) * inv_norm * scale;
+                        expert_weights_storage[base + i] = vals[i] * inv_norm * scale;
                    }
                }
                &expert_weights_storage
@@ -558,10 +383,10 @@ impl HostOp for GLUMoE {
        let hidden_tmp = unsafe { stream.alloc::<u8>(intermediate * 2)? }; // BF16
        let workspace = unsafe { stream.alloc::<u8>(WORKSPACE_SIZE)? };

-        let xbf16_ptr = slice_ptr(&x_bf16_buf, stream);
-        let gu_out_ptr = slice_ptr(&gate_up_out_buf, stream);
-        let hid_ptr = slice_ptr(&hidden_tmp, stream);
-        let ws_ptr = slice_ptr(&workspace, stream);
+        let xbf16_ptr = buf_ptr(&x_bf16_buf, stream);
+        let gu_out_ptr = buf_ptr(&gate_up_out_buf, stream);
+        let hid_ptr = buf_ptr(&hidden_tmp, stream);
+        let ws_ptr = buf_ptr(&workspace, stream);

        // Cast x F32 → BF16
        let n_cast = (seq * hidden) as i32;
@@ -580,15 +405,17 @@ impl HostOp for GLUMoE {
        }

        // Per-token expert computation
-        let gu_stride = gu_stride_bytes as u64; // bytes per expert gate_up (BF16)
-        let down_stride = down_stride_bytes as u64; // bytes per expert down (BF16)
+        let gu_stride = (gate_up_dim * hidden * 2) as u64; // bytes per expert gate_up (BF16)
+        let down_stride = (hidden * intermediate * 2) as u64; // bytes per expert down (BF16)

        for t in 0..seq {
            let x_t_ptr = xbf16_ptr + (t * hidden * 2) as u64; // BF16
+            let expert_indices = &topk_idx_i32[t * top_k..(t + 1) * top_k];
            let weights = &expert_weights_f32[t * top_k..(t + 1) * top_k];

-            for (i, &weight) in weights.iter().enumerate() {
-                let expert_idx = topk_idx_at(t, i) as usize;
+            for (i, (&expert_idx, &weight)) in expert_indices.iter().zip(weights.iter()).enumerate()
+            {
+                let expert_idx = expert_idx as usize;

                // a. Gate+Up matmul (BF16 in, BF16 out)
                let expert_gu_ptr = gate_up_ptr + expert_idx as u64 * gu_stride;
@@ -681,11 +508,7 @@ impl HostOp for GLUMoE {
 // Helpers
 // ============================================================

-fn buf_ptr(buf: DeviceBuffer, _stream: &Arc<CudaStream>) -> u64 {
-    buf.ptr()
-}
-
-fn slice_ptr(buf: &CudaSlice<u8>, stream: &Arc<CudaStream>) -> u64 {
+fn buf_ptr(buf: &CudaSlice<u8>, stream: &Arc<CudaStream>) -> u64 {
    let (ptr, _guard) = buf.device_ptr(stream);
    ptr
 }
--- a/crates/luminal_cuda_lite/src/kernel/conv2d.rs
+++ b/crates/luminal_cuda_lite/src/kernel/conv2d.rs
@@ -1,289 +0,0 @@
-//! Direct conv2d_bias kernel — fuses unfold + matmul + bias into one
-//! CUDA kernel with no `(H_out*W_out, C_in*K*K)` intermediate matrix.
-//!
-//! This is exposed as a luminal `CustomOp`, not a standard egglog-rewritten
-//! `KernelOp`, because the conv has no useful fusion opportunities with
-//! surrounding ops in the graphs it's used in (the VAE's resnet blocks),
-//! and pattern-matching the unfold+permute+merge_dims+matmul+bias chain
-//! reliably from egglog is significantly more work than just bypassing
-//! the egglog rewrite path entirely.
-//!
-//! The kernel is one-thread-per-output: each thread computes
-//!   `out[co, ho, wo] = bias[co] + sum_{ci,ki,kj} input[ci, ho*S+ki-P, wo*S+kj-P] * weight[co, ci, ki, kj]`
-//! with bounds checks on the spatial dims for padding. This is far from
-//! peak FLOPs (no shared-memory tiling, no warp-level reduction over K)
-//! but it's correct and the memory footprint is just the input + weight +
-//! bias + output buffers — no `(M, K)` or `(M, N, K)` intermediate, so it
-//! scales linearly with the actual conv FLOPs rather than blowing up at
-//! large H/W like the unfold-based formulation.
-
-use std::sync::Arc;
-
-use cudarc::driver::{CudaFunction, CudaModule, CudaSlice, CudaStream};
-use luminal::prelude::FxHashMap;
-use luminal::{
-    dtype::DType, graph::Graph, op::CustomOp, op::LLIROp, prelude::GraphTensor, shape::Expression,
-};
-
-use crate::compile_module_image_for_current_device;
-use crate::kernel::KernelOp;
-
-/// Direct conv2d-with-bias kernel. All shape/kernel params are static
-/// (baked into the CUDA source via #defines), so each conv shape gets
-/// its own compiled kernel. Inputs (in order): input `(C_in, H_in, W_in)`,
-/// weight `(C_out, C_in*K*K)` (i.e. flattened `(C_out, C_in, K, K)`), bias
-/// `(C_out,)`. Output: `(C_out, H_out, W_out)`.
-#[derive(Debug, Clone)]
-pub struct Conv2DKernel {
-    pub c_in: usize,
-    pub h_in: usize,
-    pub w_in: usize,
-    pub c_out: usize,
-    pub kernel: usize,
-    pub stride: usize,
-    pub padding: usize,
-    pub h_out: usize,
-    pub w_out: usize,
-}
-
-impl Conv2DKernel {
-    fn output_elements(&self) -> usize {
-        self.c_out * self.h_out * self.w_out
-    }
-}
-
-const THREADS_PER_BLOCK: usize = 256;
-
-impl KernelOp for Conv2DKernel {
-    fn compile(
-        &self,
-        stream: &Arc<CudaStream>,
-        compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
-    ) -> (
-        CudaFunction,
-        Arc<CudaModule>,
-        String,
-        (Expression, Expression, Expression),
-        (Expression, Expression, Expression),
-        Expression,
-        FxHashMap<char, CudaSlice<u8>>,
-    ) {
-        let total = self.output_elements();
-        let grid = total.div_ceil(THREADS_PER_BLOCK);
-
-        let kernel = format!(
-            "
-extern \"C\" __global__ void conv2d_bias_kernel(
-    float* __restrict__ out,
-    const float* __restrict__ input,
-    const float* __restrict__ weight,
-    const float* __restrict__ bias
-) {{
-    const int TOTAL = {total};
-    const int CIN  = {c_in};
-    const int H    = {h_in};
-    const int W    = {w_in};
-    const int HOUT = {h_out};
-    const int WOUT = {w_out};
-    const int K    = {k};
-    const int S    = {s};
-    const int P    = {p};
-
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= TOTAL) return;
-    int hw = HOUT * WOUT;
-    int co = idx / hw;
-    int rem = idx - co * hw;
-    int ho = rem / WOUT;
-    int wo = rem - ho * WOUT;
-
-    float acc = bias[co];
-    int weight_co_base = co * (CIN * K * K);
-    for (int ci = 0; ci < CIN; ci++) {{
-        int input_ci_base = ci * (H * W);
-        int weight_ci_base = weight_co_base + ci * (K * K);
-        #pragma unroll
-        for (int ki = 0; ki < K; ki++) {{
-            int hi = ho * S + ki - P;
-            if (hi < 0 || hi >= H) continue;
-            int input_row_base = input_ci_base + hi * W;
-            int weight_row_base = weight_ci_base + ki * K;
-            #pragma unroll
-            for (int kj = 0; kj < K; kj++) {{
-                int wj = wo * S + kj - P;
-                if (wj < 0 || wj >= W) continue;
-                acc += input[input_row_base + wj] * weight[weight_row_base + kj];
-            }}
-        }}
-    }}
-    out[idx] = acc;
-}}
-",
-            total = total,
-            c_in = self.c_in,
-            h_in = self.h_in,
-            w_in = self.w_in,
-            h_out = self.h_out,
-            w_out = self.w_out,
-            k = self.kernel,
-            s = self.stride,
-            p = self.padding,
-        );
-
-        let (module, func) = if let Some((m, f)) = compile_cache.get(&kernel) {
-            (m.clone(), f.clone())
-        } else {
-            let ptx = compile_module_image_for_current_device(stream.context(), &kernel).unwrap();
-            let module = stream.context().load_module(ptx).unwrap();
-            let func = module.load_function("conv2d_bias_kernel").unwrap();
-            compile_cache.insert(kernel.clone(), (module.clone(), func.clone()));
-            (module, func)
-        };
-
-        (
-            func,
-            module,
-            kernel,
-            (
-                Expression::from(grid),
-                Expression::from(1usize),
-                Expression::from(1usize),
-            ),
-            (
-                Expression::from(THREADS_PER_BLOCK),
-                Expression::from(1usize),
-                Expression::from(1usize),
-            ),
-            Expression::from(0usize),
-            FxHashMap::default(),
-        )
-    }
-
-    fn output_size(&self) -> Expression {
-        Expression::from(self.output_elements())
-    }
-
-    fn output_bytes(&self) -> Expression {
-        self.output_size() * 4
-    }
-
-    fn output_dtype(&self) -> DType {
-        DType::F32
-    }
-
-    fn bytes_loaded(&self) -> Expression {
-        // Per output: C_in * K * K input loads + same many weight loads + 1 bias load.
-        let per_out = self.c_in * self.kernel * self.kernel * 2 + 1;
-        Expression::from(self.output_elements() * per_out * 4)
-    }
-
-    fn bytes_stored(&self) -> Expression {
-        self.output_size() * 4
-    }
-
-    fn flops(&self) -> Expression {
-        // 2 * C_in * K * K mul-adds per output, plus the bias add = +1.
-        let per_out = self.c_in * self.kernel * self.kernel * 2 + 1;
-        Expression::from(self.output_elements() * per_out)
-    }
-
-    fn kernel_name(&self) -> &'static str {
-        "Conv2DBias"
-    }
-}
-
-/// luminal `CustomOp` that wraps `Conv2DKernel`. Lets us drop the kernel
-/// straight into an HLIR graph via `cx.custom_op(...)` without going
-/// through egglog rewrites.
-#[derive(Debug, Clone)]
-pub struct Conv2DCustom(pub Conv2DKernel);
-
-impl CustomOp for Conv2DCustom {
-    fn to_llir_op(&self) -> LLIROp {
-        LLIROp::new::<dyn KernelOp>(Box::new(self.0.clone()) as Box<dyn KernelOp>)
-    }
-}
-
-/// 2D conv-with-bias on a `(C_in, H, W)` F32 input tensor, with weights
-/// stored as `(C_out, C_in*K*K)` and bias as `(C_out,)`. Stride/padding/kernel
-/// are static. Output: `(C_out, H_out, W_out)`.
-///
-/// This is a thin wrapper over [`Conv2DKernel`] that hides the
-/// `cx.custom_op` plumbing. All inputs MUST be `DType::F32` and contiguous
-/// row-major; pass `tensor * 1.0_f32` first if you have a strided view.
-pub fn conv2d_bias(
-    input: GraphTensor,
-    weight: GraphTensor,
-    bias: GraphTensor,
-    kernel: usize,
-    stride: usize,
-    padding: usize,
-) -> GraphTensor {
-    assert_eq!(input.dtype, DType::F32, "conv2d_bias requires F32 input");
-    assert_eq!(weight.dtype, DType::F32, "conv2d_bias requires F32 weight");
-    assert_eq!(bias.dtype, DType::F32, "conv2d_bias requires F32 bias");
-
-    let dims = input.dims();
-    assert_eq!(dims.len(), 3, "conv2d_bias expects (C_in, H, W) input");
-    let c_in = dims[0].to_usize().expect("C_in must be a static dim");
-    let h_in = dims[1].to_usize().expect("H must be a static dim");
-    let w_in = dims[2].to_usize().expect("W must be a static dim");
-
-    let w_dims = weight.dims();
-    assert_eq!(
-        w_dims.len(),
-        2,
-        "conv2d_bias expects weight (C_out, C_in*K*K)"
-    );
-    let c_out = w_dims[0].to_usize().expect("C_out must be a static dim");
-    let w_kk = w_dims[1]
-        .to_usize()
-        .expect("weight inner dim must be static");
-    assert_eq!(
-        w_kk,
-        c_in * kernel * kernel,
-        "weight inner dim {w_kk} != C_in*K*K = {}",
-        c_in * kernel * kernel,
-    );
-
-    let b_dims = bias.dims();
-    assert_eq!(b_dims.len(), 1, "conv2d_bias expects bias (C_out,)");
-    assert_eq!(
-        b_dims[0].to_usize().expect("bias dim must be static"),
-        c_out
-    );
-
-    assert!(
-        h_in + 2 * padding >= kernel,
-        "padded H_in ({}) is smaller than kernel ({})",
-        h_in + 2 * padding,
-        kernel,
-    );
-    assert!(
-        w_in + 2 * padding >= kernel,
-        "padded W_in ({}) is smaller than kernel ({})",
-        w_in + 2 * padding,
-        kernel,
-    );
-    let h_out = (h_in + 2 * padding - kernel) / stride + 1;
-    let w_out = (w_in + 2 * padding - kernel) / stride + 1;
-
-    let kern = Conv2DKernel {
-        c_in,
-        h_in,
-        w_in,
-        c_out,
-        kernel,
-        stride,
-        padding,
-        h_out,
-        w_out,
-    };
-    let cx: &mut Graph = unsafe { &mut *input.graph_ref };
-    cx.custom_op(
-        Conv2DCustom(kern),
-        vec![input, weight, bias],
-        (c_out, h_out, w_out),
-        DType::F32,
-    )
-}
--- a/crates/luminal_cuda_lite/src/kernel/fusion/elementwise.rs
+++ b/crates/luminal_cuda_lite/src/kernel/fusion/elementwise.rs
@@ -1,378 +0,0 @@
-// =========================================================================
-// Generic CUDA elementwise ops used inside FusionStart/FusionEnd regions.
-//
-// CUDA elementwise execution is represented as a FusionEnd-rooted region even
-// for a single op. These ops are therefore region-internal only; standalone
-// compilation is intentionally unsupported.
-// =========================================================================
-
-use std::sync::Arc;
-
-use cudarc::driver::{CudaFunction, CudaModule, CudaSlice, CudaStream};
-use luminal::{
-    egglog_utils::{
-        api::{Rule, SortDef, sort},
-        base::{DTYPE, ELIST, OP_KIND, STRING},
-        extract_dtype, extract_expr_list,
-    },
-    op::*,
-    prelude::*,
-};
-
-use crate::kernel::KernelOp;
-
-pub type Ops = (CudaUnaryElementwise, CudaBinaryElementwise);
-
-type CompileOut = (
-    CudaFunction,
-    Arc<CudaModule>,
-    String,
-    (Expression, Expression, Expression),
-    (Expression, Expression, Expression),
-    Expression,
-    FxHashMap<char, CudaSlice<u8>>,
-);
-
-fn extract_string_label(egraph: &SerializedEGraph, node: &ENodeId) -> String {
-    egraph.enodes[node].0.trim_matches('"').to_string()
-}
-
-#[derive(Default, Debug, Clone)]
-pub struct CudaUnaryElementwise {
-    pub(crate) op: String,
-    pub(crate) shape: Vec<Expression>,
-    pub(crate) in_strides: Vec<Expression>,
-    pub(crate) out_strides: Vec<Expression>,
-    pub(crate) dtype: DType,
-}
-
-impl EgglogOp for CudaUnaryElementwise {
-    fn sort(&self) -> SortDef {
-        sort(
-            OP_KIND,
-            "CudaUnaryElementwise",
-            &[
-                ("op", STRING),
-                ("shape", ELIST),
-                ("strides", ELIST),
-                ("out_strides", ELIST),
-                ("dtype", DTYPE),
-            ],
-        )
-    }
-
-    fn n_inputs(&self) -> usize {
-        1
-    }
-
-    fn rewrites(&self) -> Vec<Rule> {
-        let mut rules = Vec::new();
-        for (hlir, opcode) in [
-            ("Sin", "Sin"),
-            ("Sqrt", "Sqrt"),
-            ("Exp2", "Exp2"),
-            ("Log2", "Log2"),
-            ("Recip", "Recip"),
-        ] {
-            rules.push(Rule::raw(format!(
-                "(rule (
-                    (= ?u (Op ({hlir} ?shape ?s ?out_s) (ICons ?x (INil))))
-                    (= ?dt (dtype ?u))
-                 ) (
-                    (let ?fs (Op (FusionStart ?shape ?s ?dt) (ICons ?x (INil))))
-                    (let ?elem (Op (CudaUnaryElementwise \"{opcode}\" ?shape ?s ?out_s ?dt)
-                                   (ICons ?fs (INil))))
-                    (let ?fe (Op (FusionEnd ?shape ?out_s ?dt) (ICons ?elem (INil))))
-                    (union ?u ?fe)
-                    (set (dtype ?fe) ?dt)
-                 ) :ruleset kernel_lower :name \"cuda-elem-singleton-{hlir}\")"
-            )));
-        }
-
-        rules.push(Rule::raw(
-            "(rule
-                (
-                    (= ?mul (Op (Mul ?shape ?x_stride ?const_stride ?inter_stride) (ICons ?x (ICons ?exp_const (INil)))))
-                    (= ?exp2 (Op (Exp2 ?shape ?inter_stride ?out_stride) (ICons ?mul (INil))))
-                    (= ?dt (dtype ?x))
-                    (= ?cv (Op (Constant ?val) (INil)))
-                    (= ?exp_const ?cv)
-                    (> ?val 1.44)
-                    (< ?val 1.45)
-                )
-                (
-                    (let ?fs (Op (FusionStart ?shape ?x_stride ?dt) (ICons ?x (INil))))
-                    (let ?elem (Op (CudaUnaryElementwise \"Exp\" ?shape ?x_stride ?out_stride ?dt)
-                                   (ICons ?fs (INil))))
-                    (let ?fe (Op (FusionEnd ?shape ?out_stride ?dt) (ICons ?elem (INil))))
-                    (union ?exp2 ?fe)
-                    (set (dtype ?fe) ?dt)
-                )
-                :ruleset direct_kernel
-                :name \"direct-exp-region\"
-            )",
-        ));
-
-        rules.push(Rule::raw(
-            "(datatype*
-                (CudaSigmoidScaledState
-                    (MkCudaSigmoidScaledState IR EList EList DType)
-                )
-            )
-            (function cuda_sigmoid_scaled (IR) CudaSigmoidScaledState :merge new)
-
-            (rule
-            (
-                (= ?neg1 (Op (Constant ?nv) (INil)))
-                (< ?nv -0.99)
-                (> ?nv -1.01)
-                (= ?neg_x (Op (Mul ?shape ?x_stride ?neg_stride ?neg_out_stride) (ICons ?x (ICons ?neg1 (INil)))))
-                (= ?log2e (Op (Constant ?lv) (INil)))
-                (> ?lv 1.44)
-                (< ?lv 1.45)
-                (= ?scaled (Op (Mul ?shape ?neg_out_stride ?log2e_stride ?scaled_stride) (ICons ?neg_x (ICons ?log2e (INil)))))
-                (= ?dt (dtype ?x))
-            )
-            (
-                (set (cuda_sigmoid_scaled ?scaled)
-                    (MkCudaSigmoidScaledState ?x ?shape ?x_stride ?dt))
-            )
-            :ruleset direct_kernel
-            :name \"direct-sigmoid-scaled-region-marker\"
-            )
-
-            (rule
-            (
-                (= ?scaled_state (cuda_sigmoid_scaled ?scaled))
-                (= ?scaled_state (MkCudaSigmoidScaledState ?x ?shape ?x_stride ?dt))
-                (= ?exp2 (Op (Exp2 ?shape ?scaled_stride ?exp_stride) (ICons ?scaled (INil))))
-                (= ?one (Op (Constant ?ov) (INil)))
-                (> ?ov 0.99)
-                (< ?ov 1.01)
-                (= ?plus_one (Op (Add ?shape ?exp_stride ?one_stride ?add_stride) (ICons ?exp2 (ICons ?one (INil)))))
-                (= ?sig_out (Op (Recip ?shape ?add_stride ?out_stride) (ICons ?plus_one (INil))))
-            )
-            (
-                (let ?fs (Op (FusionStart ?shape ?x_stride ?dt) (ICons ?x (INil))))
-                (let ?elem (Op (CudaUnaryElementwise \"Sigmoid\" ?shape ?x_stride ?out_stride ?dt)
-                               (ICons ?fs (INil))))
-                (let ?fe (Op (FusionEnd ?shape ?out_stride ?dt) (ICons ?elem (INil))))
-                (union ?sig_out ?fe)
-                (set (dtype ?fe) ?dt)
-            )
-            :ruleset direct_kernel
-            :name \"direct-sigmoid-region\"
-            )",
-        ));
-
-        rules
-    }
-
-    fn cleanup(&self) -> bool {
-        false
-    }
-
-    fn extract<'a>(
-        &'a self,
-        egraph: &'a SerializedEGraph,
-        kind_children: &[&'a ENodeId],
-        input_enodes: Vec<&'a ENodeId>,
-        list_cache: &mut FxHashMap<&'a ENodeId, Vec<Expression>>,
-        expr_cache: &mut FxHashMap<&'a ENodeId, Expression>,
-    ) -> (LLIROp, Vec<&'a ENodeId>) {
-        (
-            LLIROp::new::<dyn KernelOp>(Box::new(Self {
-                op: extract_string_label(egraph, kind_children[0]),
-                shape: extract_expr_list(egraph, kind_children[1], list_cache, expr_cache).unwrap(),
-                in_strides: extract_expr_list(egraph, kind_children[2], list_cache, expr_cache)
-                    .unwrap(),
-                out_strides: extract_expr_list(egraph, kind_children[3], list_cache, expr_cache)
-                    .unwrap(),
-                dtype: extract_dtype(egraph, kind_children[4]),
-            })),
-            input_enodes,
-        )
-    }
-}
-
-impl KernelOp for CudaUnaryElementwise {
-    fn compile(
-        &self,
-        _stream: &Arc<CudaStream>,
-        _compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
-    ) -> CompileOut {
-        unreachable!("CudaUnaryElementwise must be compiled through fusion region codegen")
-    }
-
-    fn output_size(&self) -> Expression {
-        self.shape.iter().copied().product()
-    }
-
-    fn output_bytes(&self) -> Expression {
-        (self.output_size() * self.dtype.bits()).ceil_div(8)
-    }
-
-    fn bytes_loaded(&self) -> Expression {
-        self.output_bytes()
-    }
-
-    fn bytes_stored(&self) -> Expression {
-        self.output_bytes()
-    }
-
-    fn flops(&self) -> Expression {
-        self.output_size()
-    }
-
-    fn output_dtype(&self) -> DType {
-        self.dtype
-    }
-
-    fn kernel_name(&self) -> &'static str {
-        "CudaUnaryElementwise"
-    }
-}
-
-#[derive(Default, Debug, Clone)]
-pub struct CudaBinaryElementwise {
-    pub(crate) op: String,
-    pub(crate) out_shape: Vec<Expression>,
-    pub(crate) a_stride: Vec<Expression>,
-    pub(crate) b_stride: Vec<Expression>,
-    pub(crate) out_stride: Vec<Expression>,
-    pub(crate) dtype: DType,
-}
-
-impl EgglogOp for CudaBinaryElementwise {
-    fn sort(&self) -> SortDef {
-        sort(
-            OP_KIND,
-            "CudaBinaryElementwise",
-            &[
-                ("op", STRING),
-                ("shape", ELIST),
-                ("a_strides", ELIST),
-                ("b_strides", ELIST),
-                ("out_strides", ELIST),
-                ("dtype", DTYPE),
-            ],
-        )
-    }
-
-    fn n_inputs(&self) -> usize {
-        2
-    }
-
-    fn rewrites(&self) -> Vec<Rule> {
-        vec![
-            Rule::raw(
-                "(rule (
-                    (= ?bin (Op (Add ?shape ?a_s ?b_s ?out_s) (ICons ?a (ICons ?b (INil)))))
-                    (= ?dt (dtype ?bin))
-                 ) (
-                    (let ?fs_a (Op (FusionStart ?shape ?a_s ?dt) (ICons ?a (INil))))
-                    (let ?fs_b (Op (FusionStart ?shape ?b_s ?dt) (ICons ?b (INil))))
-                    (let ?elem (Op (CudaBinaryElementwise \"Add\" ?shape ?a_s ?b_s ?out_s ?dt)
-                                   (ICons ?fs_a (ICons ?fs_b (INil)))))
-                    (let ?fe (Op (FusionEnd ?shape ?out_s ?dt) (ICons ?elem (INil))))
-                    (union ?bin ?fe)
-                    (set (dtype ?fe) ?dt)
-                 ) :ruleset kernel_lower :name \"cuda-elem-singleton-Add\")",
-            ),
-            Rule::raw(
-                "(rule (
-                    (= ?bin (Op (Mul ?shape ?a_s ?b_s ?out_s) (ICons ?a (ICons ?b (INil)))))
-                    (= ?dt (dtype ?a))
-                 ) (
-                    (let ?fs_a (Op (FusionStart ?shape ?a_s ?dt) (ICons ?a (INil))))
-                    (let ?fs_b (Op (FusionStart ?shape ?b_s ?dt) (ICons ?b (INil))))
-                    (let ?elem (Op (CudaBinaryElementwise \"Mul\" ?shape ?a_s ?b_s ?out_s ?dt)
-                                   (ICons ?fs_a (ICons ?fs_b (INil)))))
-                    (let ?fe (Op (FusionEnd ?shape ?out_s ?dt) (ICons ?elem (INil))))
-                    (union ?bin ?fe)
-                    (set (dtype ?fe) ?dt)
-                 ) :ruleset kernel_lower :name \"cuda-elem-singleton-Mul\")",
-            ),
-        ]
-    }
-
-    fn cleanup(&self) -> bool {
-        false
-    }
-
-    fn extract<'a>(
-        &'a self,
-        egraph: &'a SerializedEGraph,
-        kind_children: &[&'a ENodeId],
-        input_enodes: Vec<&'a ENodeId>,
-        list_cache: &mut FxHashMap<&'a ENodeId, Vec<Expression>>,
-        expr_cache: &mut FxHashMap<&'a ENodeId, Expression>,
-    ) -> (LLIROp, Vec<&'a ENodeId>) {
-        let mut out_shape =
-            extract_expr_list(egraph, kind_children[1], list_cache, expr_cache).unwrap();
-        let mut a_stride =
-            extract_expr_list(egraph, kind_children[2], list_cache, expr_cache).unwrap();
-        let mut b_stride =
-            extract_expr_list(egraph, kind_children[3], list_cache, expr_cache).unwrap();
-        let mut out_stride =
-            extract_expr_list(egraph, kind_children[4], list_cache, expr_cache).unwrap();
-        let n = out_shape
-            .len()
-            .min(a_stride.len())
-            .min(b_stride.len())
-            .min(out_stride.len());
-        out_shape.truncate(n);
-        a_stride.truncate(n);
-        b_stride.truncate(n);
-        out_stride.truncate(n);
-        (
-            LLIROp::new::<dyn KernelOp>(Box::new(Self {
-                op: extract_string_label(egraph, kind_children[0]),
-                out_shape,
-                a_stride,
-                b_stride,
-                out_stride,
-                dtype: extract_dtype(egraph, kind_children[5]),
-            })),
-            input_enodes,
-        )
-    }
-}
-
-impl KernelOp for CudaBinaryElementwise {
-    fn compile(
-        &self,
-        _stream: &Arc<CudaStream>,
-        _compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
-    ) -> CompileOut {
-        unreachable!("CudaBinaryElementwise must be compiled through fusion region codegen")
-    }
-
-    fn output_size(&self) -> Expression {
-        self.out_shape.iter().copied().product()
-    }
-
-    fn output_bytes(&self) -> Expression {
-        (self.output_size() * self.dtype.bits()).ceil_div(8)
-    }
-
-    fn bytes_loaded(&self) -> Expression {
-        self.output_bytes() * 2
-    }
-
-    fn bytes_stored(&self) -> Expression {
-        self.output_bytes()
-    }
-
-    fn flops(&self) -> Expression {
-        self.output_size()
-    }
-
-    fn output_dtype(&self) -> DType {
-        self.dtype
-    }
-
-    fn kernel_name(&self) -> &'static str {
-        "CudaBinaryElementwise"
-    }
-}
--- a/crates/luminal_cuda_lite/src/kernel/fusion/fused_ops.rs
+++ b/crates/luminal_cuda_lite/src/kernel/fusion/fused_ops.rs
@@ -0,0 +1,451 @@
+// =========================================================================
+// Fused elementwise op variants used inside FusionStart/FusionEnd regions.
+//
+// Each `FusedX` struct mirrors its un-fused `KernelX` sibling field-for-field
+// and serves a single purpose: give the egglog rules a distinct sort to
+// rewrite into so a pair-fuse rule's RHS can never re-match its own LHS
+// pattern. Cascade prevention by typing.
+//
+// `compile()` is a *fallback* path. The fast path collapses each FE-rooted
+// region into one CUDA kernel inside `region_codegen` and FusedX/FS/FE
+// never reach kernel_to_host's compile loop. But extraction can produce
+// LLIR shapes the detector doesn't sweep into a region, so each FusedX's
+// standalone `compile()` falls back to emitting the same kernel its
+// un-fused KernelX sibling would — correct, just one launch per op.
+// =========================================================================
+
+use std::sync::Arc;
+
+use cudarc::driver::{CudaFunction, CudaModule, CudaSlice, CudaStream};
+use luminal::{
+    egglog_utils::{
+        api::{Rule, SortDef, sort},
+        base::{DTYPE, ELIST, OP_KIND},
+        extract_dtype, extract_expr_list,
+    },
+    op::*,
+    prelude::*,
+};
+
+use crate::{
+    compile_module_image_for_current_device, cuda_dtype,
+    kernel::KernelOp,
+    kernel::hlir::{dtype_includes, generate_dyn_dims_defines},
+};
+
+pub type Ops = (
+    FusedSin,
+    FusedSqrt,
+    FusedExp,
+    FusedExp2,
+    FusedLog2,
+    FusedRecip,
+    FusedAdd,
+    FusedMul,
+);
+
+// Standard `compile()` return tuple (matches the trait signature).
+type CompileOut = (
+    CudaFunction,
+    Arc<CudaModule>,
+    String,
+    (Expression, Expression, Expression),
+    (Expression, Expression, Expression),
+    Expression,
+    FxHashMap<char, CudaSlice<u8>>,
+);
+
+// =========================================================================
+// Fallback kernel templates — used when a FusedX op reaches
+// `kernel_to_host` standalone (region detection missed it). Same CUDA as
+// the matching un-fused KernelX would emit, parameterised by the per-op
+// body expression. The fast path goes through `region_codegen`.
+// =========================================================================
+
+#[allow(clippy::too_many_arguments)]
+fn compile_unary_fallback(
+    stream: &Arc<CudaStream>,
+    compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
+    kernel_name: &str,
+    body_expr: &str, // CUDA expression on `in[{in_idx}]`, e.g. "sinf(in[{in_idx}])"
+    shape: &[Expression],
+    in_strides: &[Expression],
+    out_strides: &[Expression],
+    dtype: DType,
+) -> CompileOut {
+    let vars = shape
+        .iter()
+        .flat_map(|e| e.dyn_vars())
+        .chain(in_strides.iter().flat_map(|e| e.dyn_vars()))
+        .chain(out_strides.iter().flat_map(|e| e.dyn_vars()))
+        .collect::<FxHashSet<_>>();
+    let cuda_ty = cuda_dtype(dtype);
+    let includes = dtype_includes(&[dtype]);
+    let (dyn_defines, _sorted_dims) = generate_dyn_dims_defines(&vars);
+    let dyn_dims_param = if vars.is_empty() {
+        ""
+    } else {
+        ", const int* dyn_dims"
+    };
+    let n_elements = shape.iter().copied().product::<Expression>().to_kernel();
+    let out_idx = flatten_strides(shape, out_strides).to_kernel();
+    let in_idx = flatten_strides(shape, in_strides).to_kernel();
+    let body = body_expr.replace("{in_idx}", &in_idx);
+    let kernel = format!(
+        "{includes}\n{dyn_defines}\nextern \"C\" {{\n\
+         \x20   __global__ void {kernel_name}({cuda_ty} *out, const {cuda_ty} *in{dyn_dims_param}) {{\n\
+         \x20       long long const_z = (long long)blockIdx.x * blockDim.x + threadIdx.x;\n\
+         \x20       if (const_z >= {n_elements}) return;\n\
+         \x20       out[{out_idx}] = {body};\n\
+         \x20   }}\n}}"
+    );
+    let (module, func) = if let Some((m, f)) = compile_cache.get(&kernel) {
+        (m.clone(), f.clone())
+    } else {
+        let ptx = compile_module_image_for_current_device(stream.context(), &kernel).unwrap();
+        let module = stream.context().load_module(ptx).unwrap();
+        let func = module.load_function(kernel_name).unwrap();
+        compile_cache.insert(kernel.clone(), (module.clone(), func.clone()));
+        (module, func)
+    };
+    let out_size = shape.iter().copied().product::<Expression>();
+    (
+        func,
+        module,
+        kernel,
+        (out_size.ceil_div(256), 1.into(), 1.into()),
+        (out_size.min(256), 1.into(), 1.into()),
+        0.into(),
+        FxHashMap::default(),
+    )
+}
+
+#[allow(clippy::too_many_arguments)]
+fn compile_binary_fallback(
+    stream: &Arc<CudaStream>,
+    compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
+    kernel_name: &str,
+    op_str: &str, // CUDA infix operator, e.g. "+", "*"
+    out_shape: &[Expression],
+    a_stride: &[Expression],
+    b_stride: &[Expression],
+    out_stride: &[Expression],
+    dtype: DType,
+) -> CompileOut {
+    let vars = out_shape
+        .iter()
+        .flat_map(|e| e.dyn_vars())
+        .chain(a_stride.iter().flat_map(|e| e.dyn_vars()))
+        .chain(b_stride.iter().flat_map(|e| e.dyn_vars()))
+        .chain(out_stride.iter().flat_map(|e| e.dyn_vars()))
+        .collect::<FxHashSet<_>>();
+    let cuda_ty = cuda_dtype(dtype);
+    let includes = dtype_includes(&[dtype, dtype]);
+    let (dyn_defines, _sorted_dims) = generate_dyn_dims_defines(&vars);
+    let dyn_dims_param = if vars.is_empty() {
+        ""
+    } else {
+        ", const int* dyn_dims"
+    };
+    let n_elements = out_shape
+        .iter()
+        .copied()
+        .product::<Expression>()
+        .to_kernel();
+    let out_idx = flatten_strides(out_shape, out_stride).to_kernel();
+    let a_idx = flatten_strides(out_shape, a_stride).to_kernel();
+    let b_idx = flatten_strides(out_shape, b_stride).to_kernel();
+    let kernel = format!(
+        "{includes}\n{dyn_defines}\nextern \"C\" {{\n\
+         \x20   __global__ void {kernel_name}({cuda_ty} *C, const {cuda_ty} *A, const {cuda_ty} *B{dyn_dims_param}) {{\n\
+         \x20       long long const_z = (long long)blockIdx.x * blockDim.x + threadIdx.x;\n\
+         \x20       if (const_z >= {n_elements}) return;\n\
+         \x20       C[{out_idx}] = A[{a_idx}] {op_str} B[{b_idx}];\n\
+         \x20   }}\n}}"
+    );
+    let (module, func) = if let Some((m, f)) = compile_cache.get(&kernel) {
+        (m.clone(), f.clone())
+    } else {
+        let ptx = compile_module_image_for_current_device(stream.context(), &kernel).unwrap();
+        let module = stream.context().load_module(ptx).unwrap();
+        let func = module.load_function(kernel_name).unwrap();
+        compile_cache.insert(kernel.clone(), (module.clone(), func.clone()));
+        (module, func)
+    };
+    let out_size = out_shape.iter().copied().product::<Expression>();
+    (
+        func,
+        module,
+        kernel,
+        (out_size.ceil_div(256), 1.into(), 1.into()),
+        (out_size.min(256), 1.into(), 1.into()),
+        0.into(),
+        FxHashMap::default(),
+    )
+}
+
+/// Generate `pub struct $Name { … unary fields … }` plus its `EgglogOp` and
+/// `KernelOp` impls. `$kernel_name` names the CUDA function (and the cache
+/// key); `$body` is the per-op CUDA expression, e.g. `"sinf(in[{in_idx}])"`.
+macro_rules! impl_fused_unary {
+    ($Name:ident, $sort:literal, $kernel_name:literal, $body:literal) => {
+        #[derive(Default, Debug, Clone)]
+        pub struct $Name {
+            pub(crate) shape: Vec<Expression>,
+            pub(crate) in_strides: Vec<Expression>,
+            pub(crate) out_strides: Vec<Expression>,
+            pub(crate) dtype: DType,
+        }
+
+        impl EgglogOp for $Name {
+            fn sort(&self) -> SortDef {
+                sort(
+                    OP_KIND,
+                    $sort,
+                    &[
+                        ("shape", ELIST),
+                        ("strides", ELIST),
+                        ("out_strides", ELIST),
+                        ("dtype", DTYPE),
+                    ],
+                )
+            }
+            fn n_inputs(&self) -> usize {
+                1
+            }
+            fn rewrites(&self) -> Vec<Rule> {
+                Vec::new()
+            }
+            fn cleanup(&self) -> bool {
+                false
+            }
+            fn extract<'a>(
+                &'a self,
+                egraph: &'a SerializedEGraph,
+                kind_children: &[&'a ENodeId],
+                input_enodes: Vec<&'a ENodeId>,
+                list_cache: &mut FxHashMap<&'a ENodeId, Vec<Expression>>,
+                expr_cache: &mut FxHashMap<&'a ENodeId, Expression>,
+            ) -> (LLIROp, Vec<&'a ENodeId>) {
+                (
+                    LLIROp::new::<dyn KernelOp>(Box::new(Self {
+                        shape: extract_expr_list(egraph, kind_children[0], list_cache, expr_cache)
+                            .unwrap(),
+                        in_strides: extract_expr_list(
+                            egraph,
+                            kind_children[1],
+                            list_cache,
+                            expr_cache,
+                        )
+                        .unwrap(),
+                        out_strides: extract_expr_list(
+                            egraph,
+                            kind_children[2],
+                            list_cache,
+                            expr_cache,
+                        )
+                        .unwrap(),
+                        dtype: extract_dtype(egraph, kind_children[3]),
+                    })),
+                    input_enodes,
+                )
+            }
+        }
+
+        impl KernelOp for $Name {
+            fn compile(
+                &self,
+                stream: &Arc<CudaStream>,
+                compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
+            ) -> CompileOut {
+                compile_unary_fallback(
+                    stream,
+                    compile_cache,
+                    $kernel_name,
+                    $body,
+                    &self.shape,
+                    &self.in_strides,
+                    &self.out_strides,
+                    self.dtype,
+                )
+            }
+            fn output_size(&self) -> Expression {
+                self.shape.iter().copied().product()
+            }
+            fn output_bytes(&self) -> Expression {
+                (self.output_size() * self.dtype.bits()).ceil_div(8)
+            }
+            fn bytes_loaded(&self) -> Expression {
+                self.output_bytes()
+            }
+            fn bytes_stored(&self) -> Expression {
+                self.output_bytes()
+            }
+            fn flops(&self) -> Expression {
+                self.shape.iter().copied().product()
+            }
+            fn output_dtype(&self) -> DType {
+                self.dtype
+            }
+            fn kernel_name(&self) -> &'static str {
+                $sort
+            }
+        }
+    };
+}
+
+/// As `impl_fused_unary!` but for binary ops: 5-field sort signature
+/// (shape + per-input strides + out_stride + dtype), n_inputs = 2.
+/// `$op_str` is the CUDA infix operator, e.g. `"+"`, `"*"`.
+macro_rules! impl_fused_binary {
+    ($Name:ident, $sort:literal, $kernel_name:literal, $op_str:literal) => {
+        #[derive(Default, Debug, Clone)]
+        pub struct $Name {
+            pub(crate) out_shape: Vec<Expression>,
+            pub(crate) a_stride: Vec<Expression>,
+            pub(crate) b_stride: Vec<Expression>,
+            pub(crate) out_stride: Vec<Expression>,
+            pub(crate) dtype: DType,
+        }
+
+        impl EgglogOp for $Name {
+            fn sort(&self) -> SortDef {
+                sort(
+                    OP_KIND,
+                    $sort,
+                    &[
+                        ("shape", ELIST),
+                        ("a_strides", ELIST),
+                        ("b_strides", ELIST),
+                        ("out_strides", ELIST),
+                        ("dtype", DTYPE),
+                    ],
+                )
+            }
+            fn n_inputs(&self) -> usize {
+                2
+            }
+            fn rewrites(&self) -> Vec<Rule> {
+                Vec::new()
+            }
+            fn cleanup(&self) -> bool {
+                false
+            }
+            fn extract<'a>(
+                &'a self,
+                egraph: &'a SerializedEGraph,
+                kind_children: &[&'a ENodeId],
+                input_enodes: Vec<&'a ENodeId>,
+                list_cache: &mut FxHashMap<&'a ENodeId, Vec<Expression>>,
+                expr_cache: &mut FxHashMap<&'a ENodeId, Expression>,
+            ) -> (LLIROp, Vec<&'a ENodeId>) {
+                (
+                    LLIROp::new::<dyn KernelOp>(Box::new(Self {
+                        out_shape: extract_expr_list(
+                            egraph,
+                            kind_children[0],
+                            list_cache,
+                            expr_cache,
+                        )
+                        .unwrap(),
+                        a_stride: extract_expr_list(
+                            egraph,
+                            kind_children[1],
+                            list_cache,
+                            expr_cache,
+                        )
+                        .unwrap(),
+                        b_stride: extract_expr_list(
+                            egraph,
+                            kind_children[2],
+                            list_cache,
+                            expr_cache,
+                        )
+                        .unwrap(),
+                        out_stride: extract_expr_list(
+                            egraph,
+                            kind_children[3],
+                            list_cache,
+                            expr_cache,
+                        )
+                        .unwrap(),
+                        dtype: extract_dtype(egraph, kind_children[4]),
+                    })),
+                    input_enodes,
+                )
+            }
+        }
+
+        impl KernelOp for $Name {
+            fn compile(
+                &self,
+                stream: &Arc<CudaStream>,
+                compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
+            ) -> CompileOut {
+                compile_binary_fallback(
+                    stream,
+                    compile_cache,
+                    $kernel_name,
+                    $op_str,
+                    &self.out_shape,
+                    &self.a_stride,
+                    &self.b_stride,
+                    &self.out_stride,
+                    self.dtype,
+                )
+            }
+            fn output_size(&self) -> Expression {
+                self.out_shape.iter().copied().product()
+            }
+            fn output_bytes(&self) -> Expression {
+                (self.output_size() * self.dtype.bits()).ceil_div(8)
+            }
+            fn bytes_loaded(&self) -> Expression {
+                let bytes = (self.output_size() * self.dtype.bits()).ceil_div(8);
+                bytes + bytes
+            }
+            fn bytes_stored(&self) -> Expression {
+                self.output_bytes()
+            }
+            fn flops(&self) -> Expression {
+                self.out_shape.iter().copied().product()
+            }
+            fn output_dtype(&self) -> DType {
+                self.dtype
+            }
+            fn kernel_name(&self) -> &'static str {
+                $sort
+            }
+        }
+    };
+}
+
+impl_fused_unary!(FusedSin, "FusedSin", "fused_sin_k", "sinf(in[{in_idx}])");
+impl_fused_unary!(
+    FusedSqrt,
+    "FusedSqrt",
+    "fused_sqrt_k",
+    "sqrtf(in[{in_idx}])"
+);
+impl_fused_unary!(FusedExp, "FusedExp", "fused_exp_k", "expf(in[{in_idx}])");
+impl_fused_unary!(
+    FusedExp2,
+    "FusedExp2",
+    "fused_exp2_k",
+    "exp2f(in[{in_idx}])"
+);
+impl_fused_unary!(
+    FusedLog2,
+    "FusedLog2",
+    "fused_log2_k",
+    "log2f(in[{in_idx}])"
+);
+impl_fused_unary!(
+    FusedRecip,
+    "FusedRecip",
+    "fused_recip_k",
+    "1.0f / in[{in_idx}]"
+);
+
+impl_fused_binary!(FusedAdd, "FusedAdd", "fused_add_k", "+");
+impl_fused_binary!(FusedMul, "FusedMul", "fused_mul_k", "*");
--- a/crates/luminal_cuda_lite/src/kernel/fusion/markers.rs
+++ b/crates/luminal_cuda_lite/src/kernel/fusion/markers.rs
@@ -9,8 +9,8 @@
 //
 // `FusionEnd::rewrites()` carries the seven rule families that build and
 // extend regions (pair-fuse / grow / merge); the actual single-kernel
-// codegen lives in `region_codegen`. Both markers' `compile()` is
-// `unreachable!()` — region codegen folds them away
+// codegen lives in `region_codegen`. Like FusedX, both markers'
+// `compile()` is `unreachable!()` — region codegen folds them away
 // before kernel_to_host's compile loop reaches an interior node.
 // =========================================================================

@@ -27,7 +27,70 @@ use luminal::{
    prelude::*,
 };

-use crate::kernel::KernelOp;
+use crate::{
+    compile_module_image_for_current_device, cuda_dtype,
+    kernel::KernelOp,
+    kernel::hlir::{dtype_includes, generate_dyn_dims_defines},
+};
+
+/// Identity-memcpy kernel used as a *fallback* when a FusionStart or
+/// FusionEnd reaches `kernel_to_host`'s compile loop standalone (i.e.,
+/// region detection didn't sweep it into a `CompileUnit::Region`). The
+/// fast path is region collapse, but model-fuzz extraction sometimes
+/// produces LLIR shapes the detector doesn't catch; this keeps
+/// execution correct in those cases.
+#[allow(clippy::type_complexity)]
+fn compile_identity_kernel(
+    stream: &Arc<CudaStream>,
+    compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
+    kernel_name: &str,
+    shape: &[Expression],
+    strides: &[Expression],
+    dtype: DType,
+) -> CompileOut {
+    let vars = shape
+        .iter()
+        .flat_map(|e| e.dyn_vars())
+        .chain(strides.iter().flat_map(|e| e.dyn_vars()))
+        .collect::<FxHashSet<_>>();
+    let cuda_ty = cuda_dtype(dtype);
+    let includes = dtype_includes(&[dtype]);
+    let (dyn_defines, _sorted_dims) = generate_dyn_dims_defines(&vars);
+    let dyn_dims_param = if vars.is_empty() {
+        ""
+    } else {
+        ", const int* dyn_dims"
+    };
+    let n_elements = shape.iter().copied().product::<Expression>().to_kernel();
+    let idx = flatten_strides(shape, strides).to_kernel();
+    let kernel = format!(
+        "{includes}\n{dyn_defines}\nextern \"C\" {{\n\
+         \x20   __global__ void {kernel_name}({cuda_ty} *out, const {cuda_ty} *in{dyn_dims_param}) {{\n\
+         \x20       long long const_z = (long long)blockIdx.x * blockDim.x + threadIdx.x;\n\
+         \x20       if (const_z >= {n_elements}) return;\n\
+         \x20       out[{idx}] = in[{idx}];\n\
+         \x20   }}\n}}"
+    );
+    let (module, func) = if let Some((m, f)) = compile_cache.get(&kernel) {
+        (m.clone(), f.clone())
+    } else {
+        let ptx = compile_module_image_for_current_device(stream.context(), &kernel).unwrap();
+        let module = stream.context().load_module(ptx).unwrap();
+        let func = module.load_function(kernel_name).unwrap();
+        compile_cache.insert(kernel.clone(), (module.clone(), func.clone()));
+        (module, func)
+    };
+    let out_size = shape.iter().copied().product::<Expression>();
+    (
+        func,
+        module,
+        kernel,
+        (out_size.ceil_div(256), 1.into(), 1.into()),
+        (out_size.min(256), 1.into(), 1.into()),
+        0.into(),
+        FxHashMap::default(),
+    )
+}

 pub type Ops = (FusionStart, FusionEnd);

@@ -96,10 +159,17 @@ impl EgglogOp for FusionStart {
 impl KernelOp for FusionStart {
    fn compile(
        &self,
-        _stream: &Arc<CudaStream>,
-        _compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
+        stream: &Arc<CudaStream>,
+        compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
    ) -> CompileOut {
-        unreachable!("FusionStart must be compiled through fusion region codegen")
+        compile_identity_kernel(
+            stream,
+            compile_cache,
+            "fusion_start_k",
+            &self.shape,
+            &self.strides,
+            self.dtype,
+        )
    }
    fn output_size(&self) -> Expression {
        self.shape.iter().copied().product()
@@ -113,9 +183,6 @@ impl KernelOp for FusionStart {
    fn kernel_name(&self) -> &'static str {
        "FusionStart"
    }
-    fn output_aliases_input(&self) -> Option<usize> {
-        Some(0)
-    }
 }

 // =========================================================================
@@ -142,164 +209,221 @@ impl EgglogOp for FusionEnd {
    }

    fn rewrites(&self) -> Vec<Rule> {
-        // Generic region growth works directly from HLIR elementwise ops into
-        // `Cuda*Elementwise` region nodes. The concrete HLIR op still appears in
-        // the egraph, so fusion remains a normal nondestructive alternative, but
-        // the region-internal representation is arity based instead of one
-        // dedicated fused sort per operation.
+        // Ablation switch: with `LUMINAL_DISABLE_BINARY_FUSION=1` set, do
+        // not register any fusion rules. The e-graph never sees the FS/FE
+        // bracketed alternative, extraction always picks the un-fused
+        // form, and the runtime path matches main with no fusion at all.
+        // Used to A/B fusion's runtime impact on a single binary.
+        if std::env::var("LUMINAL_DISABLE_BINARY_FUSION").is_ok() {
+            return Vec::new();
+        }
+        // Seven rule families build and extend FE-bracketed regions. Each
+        // pair-fuse rule's LHS pattern matches *un-fused* `KernelX` ops; the
+        // RHS produces `FusedX` variants in a different egglog sort, so the
+        // rule's own output cannot re-match its LHS — cascade is prevented
+        // by typing rather than by a discriminator field.
+        //
+        // Stride compatibility is expressed by reusing variable names: a
+        // unary inside a region matches `(KernelU ?shape ?s ?s ?dt)` (in =
+        // out, no transpose); a binary feeding a downstream op binds the
+        // binary's out-stride to the downstream op's in-stride along the
+        // connecting side.
        let mut rules = Vec::new();

+        // (KernelX kind, FusedX kind)
        let unaries: &[(&str, &str)] = &[
-            ("Sin", "Sin"),
-            ("Sqrt", "Sqrt"),
-            ("Exp2", "Exp2"),
-            ("Log2", "Log2"),
-            ("Recip", "Recip"),
+            ("KernelSin", "FusedSin"),
+            ("KernelSqrt", "FusedSqrt"),
+            ("KernelExp", "FusedExp"),
+            ("KernelExp2", "FusedExp2"),
+            ("KernelLog2", "FusedLog2"),
+            ("KernelRecip", "FusedRecip"),
+        ];
+        // (KernelX kind, FusedX kind, rule-name label)
+        let binaries: &[(&str, &str, &str)] = &[
+            ("KernelAdd", "FusedAdd", "Add"),
+            ("KernelMul", "FusedMul", "Mul"),
        ];
-        let binaries: &[(&str, &str)] = &[("Add", "Add"), ("Mul", "Mul")];

-        // Grow FE → unary consumer: U(FE(inner)) → FE(CudaUnary(inner)).
-        for (hlir, opcode) in unaries {
+        // 1. Pair-fuse U → U: U2(U1(x)) → FE(FU2(FU1(FS(x)))).
+        for (ki1, fi1) in unaries {
+            for (ko2, fo2) in unaries {
+                rules.push(Rule::raw(format!(
+                    "(rule (
+                        (= ?u1 (Op ({ki1} ?shape ?s ?s ?dt) (ICons ?x (INil))))
+                        (= ?u2 (Op ({ko2} ?shape ?s ?s ?dt) (ICons ?u1 (INil))))
+                     ) (
+                        (let ?fs (Op (FusionStart ?shape ?s ?dt) (ICons ?x (INil))))
+                        (let ?fu1 (Op ({fi1} ?shape ?s ?s ?dt) (ICons ?fs (INil))))
+                        (let ?fu2 (Op ({fo2} ?shape ?s ?s ?dt) (ICons ?fu1 (INil))))
+                        (let ?fe (Op (FusionEnd ?shape ?s ?dt) (ICons ?fu2 (INil))))
+                        (union ?u2 ?fe)
+                     ) :name \"pair-fuse-U-U-{ki1}-{ko2}\")"
+                )));
+            }
+        }
+
+        // 2. Pair-fuse B → U: U(B(a, b)) → FE(FU(FB(FS(a), FS(b)))).
+        for (kb, fb, lb) in binaries {
+            for (ku, fu) in unaries {
+                rules.push(Rule::raw(format!(
+                    "(rule (
+                        (= ?bin (Op ({kb} ?shape ?a_s ?b_s ?o_s ?dt)
+                                     (ICons ?a (ICons ?b (INil)))))
+                        (= ?u (Op ({ku} ?shape ?o_s ?o_s ?dt) (ICons ?bin (INil))))
+                     ) (
+                        (let ?fs_a (Op (FusionStart ?shape ?a_s ?dt) (ICons ?a (INil))))
+                        (let ?fs_b (Op (FusionStart ?shape ?b_s ?dt) (ICons ?b (INil))))
+                        (let ?fbin (Op ({fb} ?shape ?a_s ?b_s ?o_s ?dt)
+                                       (ICons ?fs_a (ICons ?fs_b (INil)))))
+                        (let ?fu (Op ({fu} ?shape ?o_s ?o_s ?dt) (ICons ?fbin (INil))))
+                        (let ?fe (Op (FusionEnd ?shape ?o_s ?dt) (ICons ?fu (INil))))
+                        (union ?u ?fe)
+                     ) :name \"pair-fuse-B-U-{lb}-{ku}\")"
+                )));
+            }
+        }
+
+        // 3. Pair-fuse U → B (lhs / rhs): unary feeds binary's A or B input.
+        //    LHS:  B(U(a), b) → FE(FB(FU(FS(a)), FS(b))).
+        //    RHS:  B(a, U(b)) → FE(FB(FS(a), FU(FS(b)))).
+        for (ku, fu) in unaries {
+            for (kb, fb, lb) in binaries {
+                rules.push(Rule::raw(format!(
+                    "(rule (
+                        (= ?u (Op ({ku} ?shape ?u_s ?u_s ?dt) (ICons ?a (INil))))
+                        (= ?bin (Op ({kb} ?shape ?u_s ?b_s ?o_s ?dt)
+                                     (ICons ?u (ICons ?b (INil)))))
+                     ) (
+                        (let ?fs_a (Op (FusionStart ?shape ?u_s ?dt) (ICons ?a (INil))))
+                        (let ?fs_b (Op (FusionStart ?shape ?b_s ?dt) (ICons ?b (INil))))
+                        (let ?fu (Op ({fu} ?shape ?u_s ?u_s ?dt) (ICons ?fs_a (INil))))
+                        (let ?fbin (Op ({fb} ?shape ?u_s ?b_s ?o_s ?dt)
+                                       (ICons ?fu (ICons ?fs_b (INil)))))
+                        (let ?fe (Op (FusionEnd ?shape ?o_s ?dt) (ICons ?fbin (INil))))
+                        (union ?bin ?fe)
+                     ) :name \"pair-fuse-U-B-lhs-{ku}-{lb}\")"
+                )));
+                rules.push(Rule::raw(format!(
+                    "(rule (
+                        (= ?u (Op ({ku} ?shape ?u_s ?u_s ?dt) (ICons ?b (INil))))
+                        (= ?bin (Op ({kb} ?shape ?a_s ?u_s ?o_s ?dt)
+                                     (ICons ?a (ICons ?u (INil)))))
+                     ) (
+                        (let ?fs_a (Op (FusionStart ?shape ?a_s ?dt) (ICons ?a (INil))))
+                        (let ?fs_b (Op (FusionStart ?shape ?u_s ?dt) (ICons ?b (INil))))
+                        (let ?fu (Op ({fu} ?shape ?u_s ?u_s ?dt) (ICons ?fs_b (INil))))
+                        (let ?fbin (Op ({fb} ?shape ?a_s ?u_s ?o_s ?dt)
+                                       (ICons ?fs_a (ICons ?fu (INil)))))
+                        (let ?fe (Op (FusionEnd ?shape ?o_s ?dt) (ICons ?fbin (INil))))
+                        (union ?bin ?fe)
+                     ) :name \"pair-fuse-U-B-rhs-{ku}-{lb}\")"
+                )));
+            }
+        }
+
+        // 4. Pair-fuse B → B (lhs / rhs): inner binary feeds outer's A or B.
+        for (kbi, fbi, lbi) in binaries {
+            for (kbo, fbo, lbo) in binaries {
+                rules.push(Rule::raw(format!(
+                    "(rule (
+                        (= ?bi (Op ({kbi} ?shape ?ai_s ?bi_s ?oi_s ?dt)
+                                    (ICons ?a (ICons ?b (INil)))))
+                        (= ?bo (Op ({kbo} ?shape ?oi_s ?co_s ?oo_s ?dt)
+                                    (ICons ?bi (ICons ?c (INil)))))
+                     ) (
+                        (let ?fs_a (Op (FusionStart ?shape ?ai_s ?dt) (ICons ?a (INil))))
+                        (let ?fs_b (Op (FusionStart ?shape ?bi_s ?dt) (ICons ?b (INil))))
+                        (let ?fs_c (Op (FusionStart ?shape ?co_s ?dt) (ICons ?c (INil))))
+                        (let ?fbi (Op ({fbi} ?shape ?ai_s ?bi_s ?oi_s ?dt)
+                                       (ICons ?fs_a (ICons ?fs_b (INil)))))
+                        (let ?fbo (Op ({fbo} ?shape ?oi_s ?co_s ?oo_s ?dt)
+                                       (ICons ?fbi (ICons ?fs_c (INil)))))
+                        (let ?fe (Op (FusionEnd ?shape ?oo_s ?dt) (ICons ?fbo (INil))))
+                        (union ?bo ?fe)
+                     ) :name \"pair-fuse-B-B-lhs-{lbi}-{lbo}\")"
+                )));
+                rules.push(Rule::raw(format!(
+                    "(rule (
+                        (= ?bi (Op ({kbi} ?shape ?ai_s ?bi_s ?oi_s ?dt)
+                                    (ICons ?a (ICons ?b (INil)))))
+                        (= ?bo (Op ({kbo} ?shape ?co_s ?oi_s ?oo_s ?dt)
+                                    (ICons ?c (ICons ?bi (INil)))))
+                     ) (
+                        (let ?fs_a (Op (FusionStart ?shape ?ai_s ?dt) (ICons ?a (INil))))
+                        (let ?fs_b (Op (FusionStart ?shape ?bi_s ?dt) (ICons ?b (INil))))
+                        (let ?fs_c (Op (FusionStart ?shape ?co_s ?dt) (ICons ?c (INil))))
+                        (let ?fbi (Op ({fbi} ?shape ?ai_s ?bi_s ?oi_s ?dt)
+                                       (ICons ?fs_a (ICons ?fs_b (INil)))))
+                        (let ?fbo (Op ({fbo} ?shape ?co_s ?oi_s ?oo_s ?dt)
+                                       (ICons ?fs_c (ICons ?fbi (INil)))))
+                        (let ?fe (Op (FusionEnd ?shape ?oo_s ?dt) (ICons ?fbo (INil))))
+                        (union ?bo ?fe)
+                     ) :name \"pair-fuse-B-B-rhs-{lbi}-{lbo}\")"
+                )));
+            }
+        }
+
+        // 5. Grow FE → U: U(FE(inner)) → FE(FU(inner)). No new FS.
+        for (ku, fu) in unaries {
            rules.push(Rule::raw(format!(
                "(rule (
                    (= ?fe (Op (FusionEnd ?shape ?s ?dt) (ICons ?inner (INil))))
-                    (= ?u (Op ({hlir} ?shape ?s ?s) (ICons ?fe (INil))))
+                    (= ?u (Op ({ku} ?shape ?s ?s ?dt) (ICons ?fe (INil))))
                 ) (
-                    (let ?elem (Op (CudaUnaryElementwise \"{opcode}\" ?shape ?s ?s ?dt)
-                                   (ICons ?inner (INil))))
-                    (let ?new_fe (Op (FusionEnd ?shape ?s ?dt) (ICons ?elem (INil))))
+                    (let ?fu (Op ({fu} ?shape ?s ?s ?dt) (ICons ?inner (INil))))
+                    (let ?new_fe (Op (FusionEnd ?shape ?s ?dt) (ICons ?fu (INil))))
                    (union ?u ?new_fe)
-                    (set (dtype ?new_fe) ?dt)
-                 ) :ruleset fusion_grow :name \"grow-FE-U-{hlir}\")"
+                 ) :name \"grow-FE-U-{ku}\")"
            )));
        }

-        // Grow FE → binary consumer, left and right orientations.
-        for (hlir, opcode) in binaries {
+        // 6. Grow FE → B (lhs / rhs): one input is the FE, the other external.
+        for (kb, fb, lb) in binaries {
            rules.push(Rule::raw(format!(
                "(rule (
                    (= ?fe (Op (FusionEnd ?shape ?a_s ?dt) (ICons ?inner_a (INil))))
-                    (= ?bin (Op ({hlir} ?shape ?a_s ?b_s ?out_s)
+                    (= ?bin (Op ({kb} ?shape ?a_s ?b_s ?o_s ?dt)
                                 (ICons ?fe (ICons ?b (INil)))))
                 ) (
                    (let ?fs_b (Op (FusionStart ?shape ?b_s ?dt) (ICons ?b (INil))))
-                    (let ?elem (Op (CudaBinaryElementwise \"{opcode}\" ?shape ?a_s ?b_s ?out_s ?dt)
+                    (let ?fbin (Op ({fb} ?shape ?a_s ?b_s ?o_s ?dt)
                                   (ICons ?inner_a (ICons ?fs_b (INil)))))
-                    (let ?new_fe (Op (FusionEnd ?shape ?out_s ?dt) (ICons ?elem (INil))))
+                    (let ?new_fe (Op (FusionEnd ?shape ?o_s ?dt) (ICons ?fbin (INil))))
                    (union ?bin ?new_fe)
-                    (set (dtype ?new_fe) ?dt)
-                 ) :ruleset fusion_grow :name \"grow-FE-B-lhs-{hlir}\")"
+                 ) :name \"grow-FE-B-lhs-{lb}\")"
            )));
            rules.push(Rule::raw(format!(
                "(rule (
                    (= ?fe (Op (FusionEnd ?shape ?b_s ?dt) (ICons ?inner_b (INil))))
-                    (= ?bin (Op ({hlir} ?shape ?a_s ?b_s ?out_s)
+                    (= ?bin (Op ({kb} ?shape ?a_s ?b_s ?o_s ?dt)
                                 (ICons ?a (ICons ?fe (INil)))))
                 ) (
                    (let ?fs_a (Op (FusionStart ?shape ?a_s ?dt) (ICons ?a (INil))))
-                    (let ?elem (Op (CudaBinaryElementwise \"{opcode}\" ?shape ?a_s ?b_s ?out_s ?dt)
+                    (let ?fbin (Op ({fb} ?shape ?a_s ?b_s ?o_s ?dt)
                                   (ICons ?fs_a (ICons ?inner_b (INil)))))
-                    (let ?new_fe (Op (FusionEnd ?shape ?out_s ?dt) (ICons ?elem (INil))))
+                    (let ?new_fe (Op (FusionEnd ?shape ?o_s ?dt) (ICons ?fbin (INil))))
                    (union ?bin ?new_fe)
-                    (set (dtype ?new_fe) ?dt)
-                 ) :ruleset fusion_grow :name \"grow-FE-B-rhs-{hlir}\")"
+                 ) :name \"grow-FE-B-rhs-{lb}\")"
            )));
        }

-        // Absorb an elementwise producer through a FusionStart boundary. This
-        // makes a region that initially treats `producer(...)` as an external
-        // input able to pull that producer inside later.
-        for (hlir, opcode) in unaries {
-            rules.push(Rule::raw(format!(
-                "(rule (
-                    (= ?u (Op ({hlir} ?shape ?s ?s) (ICons ?x (INil))))
-                    (= ?fs_u (Op (FusionStart ?shape ?s ?dt) (ICons ?u (INil))))
-                 ) (
-                    (let ?fs_x (Op (FusionStart ?shape ?s ?dt) (ICons ?x (INil))))
-                    (let ?elem (Op (CudaUnaryElementwise \"{opcode}\" ?shape ?s ?s ?dt)
-                                   (ICons ?fs_x (INil))))
-                    (union ?fs_u ?elem)
-                 ) :ruleset fusion_grow :name \"grow-U-FS-{hlir}\")"
-            )));
-            rules.push(Rule::raw(format!(
-                "(rule (
-                    (= ?inner_fe (Op (FusionEnd ?shape ?s ?dt) (ICons ?inner (INil))))
-                    (= ?bad_fs (Op (FusionStart ?shape ?s ?dt) (ICons ?inner_fe (INil))))
-                    (= ?bad_elem (Op (CudaUnaryElementwise \"{opcode}\" ?shape ?s ?s ?dt)
-                                     (ICons ?bad_fs (INil))))
-                    (= ?bad_fe (Op (FusionEnd ?shape ?s ?dt) (ICons ?bad_elem (INil))))
-                    (= ?good_elem (Op (CudaUnaryElementwise \"{opcode}\" ?shape ?s ?s ?dt)
-                                      (ICons ?inner (INil))))
-                    (= ?good_fe (Op (FusionEnd ?shape ?s ?dt) (ICons ?good_elem (INil))))
-                    (= ?bad_fe ?good_fe)
-                 ) (
-                    (delete (Op (FusionStart ?shape ?s ?dt) (ICons ?inner_fe (INil))))
-                 ) :ruleset cleanup :name \"cleanup-nested-FS-FE-unary-{hlir}\")"
-            )));
-        }
-        for (hlir, opcode) in binaries {
-            rules.push(Rule::raw(format!(
-                "(rule (
-                    (= ?bin (Op ({hlir} ?shape ?a_s ?b_s ?out_s)
-                                 (ICons ?a (ICons ?b (INil)))))
-                    (= ?fs_bin (Op (FusionStart ?shape ?out_s ?dt) (ICons ?bin (INil))))
-                 ) (
-                    (let ?fs_a (Op (FusionStart ?shape ?a_s ?dt) (ICons ?a (INil))))
-                    (let ?fs_b (Op (FusionStart ?shape ?b_s ?dt) (ICons ?b (INil))))
-                    (let ?elem (Op (CudaBinaryElementwise \"{opcode}\" ?shape ?a_s ?b_s ?out_s ?dt)
-                                   (ICons ?fs_a (ICons ?fs_b (INil)))))
-                    (union ?fs_bin ?elem)
-                 ) :ruleset fusion_grow :name \"grow-B-FS-{hlir}\")"
-            )));
-            rules.push(Rule::raw(format!(
-                "(rule (
-                    (= ?inner_fe (Op (FusionEnd ?shape ?a_s ?dt) (ICons ?inner_a (INil))))
-                    (= ?bad_fs (Op (FusionStart ?shape ?a_s ?dt) (ICons ?inner_fe (INil))))
-                    (= ?fs_b (Op (FusionStart ?shape ?b_s ?dt) (ICons ?b (INil))))
-                    (= ?bad_elem (Op (CudaBinaryElementwise \"{opcode}\" ?shape ?a_s ?b_s ?out_s ?dt)
-                                     (ICons ?bad_fs (ICons ?fs_b (INil)))))
-                    (= ?bad_fe (Op (FusionEnd ?shape ?out_s ?dt) (ICons ?bad_elem (INil))))
-                    (= ?good_elem (Op (CudaBinaryElementwise \"{opcode}\" ?shape ?a_s ?b_s ?out_s ?dt)
-                                      (ICons ?inner_a (ICons ?fs_b (INil)))))
-                    (= ?good_fe (Op (FusionEnd ?shape ?out_s ?dt) (ICons ?good_elem (INil))))
-                    (= ?bad_fe ?good_fe)
-                 ) (
-                    (delete (Op (FusionStart ?shape ?a_s ?dt) (ICons ?inner_fe (INil))))
-                 ) :ruleset cleanup :name \"cleanup-nested-FS-FE-binary-lhs-{hlir}\")"
-            )));
-            rules.push(Rule::raw(format!(
-                "(rule (
-                    (= ?inner_fe (Op (FusionEnd ?shape ?b_s ?dt) (ICons ?inner_b (INil))))
-                    (= ?bad_fs (Op (FusionStart ?shape ?b_s ?dt) (ICons ?inner_fe (INil))))
-                    (= ?fs_a (Op (FusionStart ?shape ?a_s ?dt) (ICons ?a (INil))))
-                    (= ?bad_elem (Op (CudaBinaryElementwise \"{opcode}\" ?shape ?a_s ?b_s ?out_s ?dt)
-                                     (ICons ?fs_a (ICons ?bad_fs (INil)))))
-                    (= ?bad_fe (Op (FusionEnd ?shape ?out_s ?dt) (ICons ?bad_elem (INil))))
-                    (= ?good_elem (Op (CudaBinaryElementwise \"{opcode}\" ?shape ?a_s ?b_s ?out_s ?dt)
-                                      (ICons ?fs_a (ICons ?inner_b (INil)))))
-                    (= ?good_fe (Op (FusionEnd ?shape ?out_s ?dt) (ICons ?good_elem (INil))))
-                    (= ?bad_fe ?good_fe)
-                 ) (
-                    (delete (Op (FusionStart ?shape ?b_s ?dt) (ICons ?inner_fe (INil))))
-                 ) :ruleset cleanup :name \"cleanup-nested-FS-FE-binary-rhs-{hlir}\")"
-            )));
-        }
-
-        // Merge two FEs at a binary: B(FE(ia), FE(ib)) → FE(CudaBinary(ia, ib)).
-        for (hlir, opcode) in binaries {
+        // 7. Merge two FEs at a binary: B(FE(ia), FE(ib)) → FE(FB(ia, ib)).
+        //    Both inners reused, no new FS — shared external tensors with
+        //    upstream FSes stay at one FS.
+        for (kb, fb, lb) in binaries {
            rules.push(Rule::raw(format!(
                "(rule (
                    (= ?fe_a (Op (FusionEnd ?shape ?a_s ?dt) (ICons ?inner_a (INil))))
                    (= ?fe_b (Op (FusionEnd ?shape ?b_s ?dt) (ICons ?inner_b (INil))))
-                    (= ?bin (Op ({hlir} ?shape ?a_s ?b_s ?out_s)
+                    (= ?bin (Op ({kb} ?shape ?a_s ?b_s ?o_s ?dt)
                                 (ICons ?fe_a (ICons ?fe_b (INil)))))
                 ) (
-                    (let ?elem (Op (CudaBinaryElementwise \"{opcode}\" ?shape ?a_s ?b_s ?out_s ?dt)
+                    (let ?fbin (Op ({fb} ?shape ?a_s ?b_s ?o_s ?dt)
                                   (ICons ?inner_a (ICons ?inner_b (INil)))))
-                    (let ?new_fe (Op (FusionEnd ?shape ?out_s ?dt) (ICons ?elem (INil))))
+                    (let ?new_fe (Op (FusionEnd ?shape ?o_s ?dt) (ICons ?fbin (INil))))
                    (union ?bin ?new_fe)
-                    (set (dtype ?new_fe) ?dt)
-                 ) :ruleset fusion_merge :name \"merge-FE-FE-{hlir}\")"
+                 ) :name \"merge-FE-FE-{lb}\")"
            )));
        }

@@ -339,10 +463,17 @@ impl EgglogOp for FusionEnd {
 impl KernelOp for FusionEnd {
    fn compile(
        &self,
-        _stream: &Arc<CudaStream>,
-        _compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
+        stream: &Arc<CudaStream>,
+        compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
    ) -> CompileOut {
-        unreachable!("FusionEnd must be compiled through fusion region codegen")
+        compile_identity_kernel(
+            stream,
+            compile_cache,
+            "fusion_end_k",
+            &self.shape,
+            &self.strides,
+            self.dtype,
+        )
    }
    fn output_size(&self) -> Expression {
        self.shape.iter().copied().product()
--- a/crates/luminal_cuda_lite/src/kernel/fusion/mod.rs
+++ b/crates/luminal_cuda_lite/src/kernel/fusion/mod.rs
@@ -2,21 +2,25 @@
 //!
 //! - `markers` — `FusionStart` / `FusionEnd` ops + the seven egglog rule
 //!   families that build and extend FE-bracketed regions.
-//! - `elementwise` — generic region-internal CUDA elementwise op variants.
+//! - `fused_ops` — eight `FusedX` op variants (interior to a region) so
+//!   pair-fuse rules' RHS sit in a different egglog sort than their LHS,
+//!   blocking cascade by typing.
 //! - `region_codegen` — `kernel_to_host` calls into here to collapse each
 //!   FE-rooted region into a single CUDA kernel at compile time.
 //!
-//! The LLIR keeps `FusionStart` / generic elementwise / `FusionEnd` nodes after
+//! The LLIR keeps `FusionStart` / `FusedX` / `FusionEnd` nodes after
 //! extraction; `region_codegen` is the only place that walks them.

-pub mod elementwise;
+pub mod fused_ops;
 pub mod markers;
 pub mod region_codegen;

-pub use elementwise::{CudaBinaryElementwise, CudaUnaryElementwise};
+pub use fused_ops::{
+    FusedAdd, FusedExp, FusedExp2, FusedLog2, FusedMul, FusedRecip, FusedSin, FusedSqrt,
+};
 pub use markers::{FusionEnd, FusionStart};

 /// All fusion-related op types that the egglog runtime needs to know about
-/// (markers + interior generic elementwise variants). Combined into a flat
-/// tuple for the `Ops` registry in `kernel::mod`.
-pub type Ops = (markers::Ops, elementwise::Ops);
+/// (markers + interior FusedX variants). Combined into a flat tuple for the
+/// `Ops` registry in `kernel::mod`.
+pub type Ops = (markers::Ops, fused_ops::Ops);
--- a/crates/luminal_cuda_lite/src/kernel/fusion/region_codegen.rs
+++ b/crates/luminal_cuda_lite/src/kernel/fusion/region_codegen.rs
@@ -1,26 +1,26 @@
 // =========================================================================
 // Region codegen for FusionStart / FusionEnd-bracketed fused regions.
 //
-// Older fusion lowering left elementwise / FusionStart / FusionEnd nodes in the post-extraction
+// PR1 left FusedX / FusionStart / FusionEnd nodes in the post-extraction
 // LLIR, each compiling to its own standalone CUDA kernel. PR2 collapses
 // every FusionEnd-rooted region into ONE fused CUDA kernel at codegen
 // time — without rewriting the LLIR.
 //
 // Pipeline:
 //   `kernel_to_host` builds a Vec<CompileUnit> from the topo order:
-//     - CompileUnit::Single(node)  — unfused non-region kernels, compiled as before.
-//     - CompileUnit::Region(rgn)   — one FE + its interior elementwise DAG +
+//     - CompileUnit::Single(node)  — un-fused KernelX, compiled as before.
+//     - CompileUnit::Region(rgn)   — one FE + its interior FusedX DAG +
 //                                    its FS leaves. Compiled here as a
 //                                    single CUDA kernel that reads from
 //                                    the region's external inputs once,
-//                                    chains all elementwise bodies through
+//                                    chains all FusedX bodies through
 //                                    register-resident locals, and writes
 //                                    the FE's output.
 //
 // The CompiledKernel for a Region is keyed on the FE node and stores
 // `inputs = external producer NodeIndices` (one per interior FusionStart),
 // so the existing buffer-pointer wiring in to_host.rs picks up the right
-// device pointers at execute time. Interior Cuda*Elementwise / FusionStart nodes
+// device pointers at execute time. Interior FusedX / FusionStart nodes
 // never enter the kernels Vec — they have no buffers, no launches.
 // =========================================================================

@@ -40,7 +40,6 @@ use as_any::Downcast;
 use crate::{
    compile_module_image_for_current_device, cuda_dtype,
    kernel::KernelOp,
-    kernel::fusion::elementwise::{CudaBinaryElementwise, CudaUnaryElementwise},
    kernel::fusion::markers::{FusionEnd, FusionStart},
    kernel::hlir::{dtype_includes, generate_dyn_dims_defines},
 };
@@ -53,10 +52,10 @@ use crate::{
 pub(crate) struct RegionUnit {
    /// The FusionEnd node that anchors this region.
    pub fe_node: NodeIndex,
-    /// Interior Cuda*Elementwise nodes, in topological order (predecessors before
+    /// Interior FusedX nodes, in topological order (predecessors before
    /// consumers). Used to emit register-binding statements in dependency
    /// order in the fused CUDA kernel body.
-    pub elementwise_topo: Vec<NodeIndex>,
+    pub fusedx_topo: Vec<NodeIndex>,
    /// FusionStart nodes that bound the region's leaves. One per external
    /// read site — duplicates (different FS LLIR nodes wrapping the same
    /// upstream tensor) are kept separate so each read uses its own
@@ -80,13 +79,13 @@ pub(crate) enum CompileUnit {

 /// Group a sub-DAG's topo order into compile units. Each FusionEnd node
 /// becomes the root of a `CompileUnit::Region`; the region's interior
-/// Cuda*Elementwise and FusionStart nodes are absorbed into that region and removed
+/// FusedX and FusionStart nodes are absorbed into that region and removed
 /// from the per-node iteration. Anything else is wrapped in
 /// `CompileUnit::Single`.
 /// Globally-absorbed FS / FE markers — the set of marker nodes that any
 /// `FusionEnd` in the LLIR walks back to during region detection. A
 /// marker is "absorbed" iff some FE in the LLIR can reach it by walking
-/// incoming edges through `FusionEnd` / Cuda*Elementwise nodes, stopping at
+/// incoming edges through `FusionEnd` / `FusedX` nodes, stopping at
 /// `FusionStart` leaves.
 ///
 /// This is computed once over the full LLIR rather than per-convex-
@@ -94,8 +93,9 @@ pub(crate) enum CompileUnit {
 /// (one whose e-graph congruence-deduplicated it across multiple
 /// regions) into a different subgraph than the FE that absorbs it.
 /// Without this global view, `build_compile_units` running on the FS's
-/// subgraph would not see any FE walking back to the FS and would emit the
-/// FS as `CompileUnit::Single`; marker standalone compilation is not supported.
+/// subgraph would not see any FE walking back to the FS, would emit the
+/// FS as `CompileUnit::Single`, and the markers' identity-memcpy
+/// fallback would compile and launch — pure overhead at runtime.
 pub(crate) fn globally_absorbed_markers(llir_graph: &LLIRGraph) -> FxHashSet<NodeIndex> {
    let name_of = |idx: NodeIndex| -> Option<&'static str> {
        llir_graph
@@ -124,7 +124,7 @@ pub(crate) fn globally_absorbed_markers(llir_graph: &LLIRGraph) -> FxHashSet<Nod
                        absorbed.insert(pred);
                        stack.push(pred);
                    }
-                    Some(_) if is_region_elementwise(llir_graph, pred) => {
+                    Some(other) if other.starts_with("Fused") => {
                        absorbed.insert(pred);
                        stack.push(pred);
                    }
@@ -188,18 +188,19 @@ pub(crate) fn build_compile_units(
                        absorbed.insert(pred);
                        stack.push(pred);
                    }
-                    Some(_) if is_region_elementwise(llir_graph, pred) => {
+                    Some(other) if other.starts_with("Fused") => {
                        interior.push(pred);
                        stack.push(pred);
                    }
                    _ => {
-                        // Non-marker, non-elementwise predecessor inside what
+                        // Non-marker, non-FusedX predecessor inside what
                        // we thought was a region. Shouldn't happen with
                        // the current rules; treat conservatively: do
-                        // not absorb it. This means the region is
+                        // not absorb — let the kernel_to_host single
+                        // path handle it. This means the region is
                        // malformed and we likely should not have a
-                        // region at all; caller will see incomplete
-                        // interior.
+                        // region at all. Caller will see incomplete
+                        // interior; the safer thing is to fall back.
                    }
                }
            }
@@ -230,56 +231,7 @@ pub(crate) fn build_compile_units(
                llir_graph
                    .neighbors_directed(fs, Direction::Incoming)
                    .next()
-                    .unwrap_or_else(|| {
-                        // Dump the malformed structure: which FE
-                        // triggered the walk, every node in fs_topo and
-                        // interior_topo, and each FS's incoming /
-                        // outgoing degree. Helps localize whether the
-                        // missing edge came from extraction or a
-                        // downstream LLIR transform.
-                        if std::env::var("LUMINAL_DEBUG_FUSION_PANIC").is_ok() {
-                            eprintln!(
-                                "FusionStart panic: fe={} (kernel={:?})",
-                                node.index(),
-                                llir_graph.node_weight(node).and_then(|op| {
-                                    op.to_dialect::<dyn KernelOp>().map(|k| k.kernel_name())
-                                }),
-                            );
-                            eprintln!("  fs_topo ({}):", fs_topo.len());
-                            for &f in &fs_topo {
-                                let in_deg = llir_graph
-                                    .neighbors_directed(f, Direction::Incoming)
-                                    .count();
-                                let out_deg = llir_graph
-                                    .neighbors_directed(f, Direction::Outgoing)
-                                    .count();
-                                let kn = llir_graph
-                                    .node_weight(f)
-                                    .and_then(|op| {
-                                        op.to_dialect::<dyn KernelOp>().map(|k| k.kernel_name())
-                                    })
-                                    .unwrap_or("?");
-                                eprintln!(
-                                    "    fs={} kind={} in_deg={} out_deg={}",
-                                    f.index(),
-                                    kn,
-                                    in_deg,
-                                    out_deg,
-                                );
-                            }
-                            eprintln!("  interior_topo ({}):", interior_topo.len());
-                            for &i in &interior_topo {
-                                let kn = llir_graph
-                                    .node_weight(i)
-                                    .and_then(|op| {
-                                        op.to_dialect::<dyn KernelOp>().map(|k| k.kernel_name())
-                                    })
-                                    .unwrap_or("?");
-                                eprintln!("    interior={} kind={}", i.index(), kn);
-                            }
-                        }
-                        panic!("FusionStart with no predecessor")
-                    })
+                    .expect("FusionStart with no predecessor")
            })
            .collect();

@@ -290,7 +242,7 @@ pub(crate) fn build_compile_units(
            node,
            RegionUnit {
                fe_node: node,
-                elementwise_topo: interior_topo,
+                fusedx_topo: interior_topo,
                fs_nodes: fs_topo,
                external_inputs,
            },
@@ -301,10 +253,11 @@ pub(crate) fn build_compile_units(
    // FE nodes with their RegionUnit and skipping anything absorbed —
    // either by a region in *this* subgraph (`absorbed`) or by any
    // region anywhere in the LLIR (`globally_absorbed`). Skipping the
-    // latter prevents shared FS markers whose consumers live in other
-    // convex subgraphs from being emitted as standalone compile units:
+    // latter prevents the identity-memcpy fallback from firing on
+    // shared FS markers whose consumers live in other convex subgraphs:
    // those FSes are absorbed by some other region, and the consuming
-    // region reads from FS's external producer.
+    // region reads from FS's external producer, so the FS never needs
+    // its own kernel.
    let mut units: Vec<CompileUnit> = Vec::new();
    for &node in topo_order {
        if let Some(region) = regions.remove(&node) {
@@ -319,53 +272,24 @@ pub(crate) fn build_compile_units(
 }

 // =========================================================================
-// Per-elementwise body templates.
+// Per-FusedX body templates.
 //
 // Each entry takes the names of the local variables holding the op's
 // inputs and returns a CUDA expression evaluating to the op's output
 // (a register-resident value, no buffer involved).
 // =========================================================================

-fn is_region_elementwise(llir_graph: &LLIRGraph, node: NodeIndex) -> bool {
-    llir_graph
-        .node_weight(node)
-        .and_then(|op| op.to_dialect::<dyn KernelOp>())
-        .is_some_and(|op| {
-            (***op).downcast_ref::<CudaUnaryElementwise>().is_some()
-                || (***op).downcast_ref::<CudaBinaryElementwise>().is_some()
-        })
-}
-
-fn elementwise_value(local: &str, dtype: DType) -> String {
-    if matches!(dtype, DType::F8E4M3 | DType::F8E5M2 | DType::F8UE8M0) {
-        format!("static_cast<float>({local})")
-    } else {
-        local.to_string()
-    }
-}
-
-fn elementwise_init_expr(expr: &str, dtype: DType, cuda_ty: &str) -> String {
-    if matches!(dtype, DType::F8E4M3 | DType::F8E5M2 | DType::F8UE8M0) {
-        format!("{cuda_ty}({expr})")
-    } else {
-        expr.to_string()
-    }
-}
-
-fn elementwise_body(op: &str, locals: &[&str], dtype: DType) -> String {
-    let a = || elementwise_value(locals[0], dtype);
-    let b = || elementwise_value(locals[1], dtype);
-    match op {
-        "Sin" => format!("sinf({})", a()),
-        "Sqrt" => format!("sqrtf({})", a()),
-        "Exp" => format!("expf({})", a()),
-        "Exp2" => format!("exp2f({})", a()),
-        "Log2" => format!("log2f({})", a()),
-        "Recip" => format!("1.0f / {}", a()),
-        "Sigmoid" => format!("1.0f / (1.0f + expf(-{}))", a()),
-        "Add" => format!("{} + {}", a(), b()),
-        "Mul" => format!("{} * {}", a(), b()),
-        other => panic!("region_codegen: unknown elementwise op {other}"),
+fn fused_body(name: &str, locals: &[&str]) -> String {
+    match name {
+        "FusedSin" => format!("sinf({})", locals[0]),
+        "FusedSqrt" => format!("sqrtf({})", locals[0]),
+        "FusedExp" => format!("expf({})", locals[0]),
+        "FusedExp2" => format!("exp2f({})", locals[0]),
+        "FusedLog2" => format!("log2f({})", locals[0]),
+        "FusedRecip" => format!("1.0f / {}", locals[0]),
+        "FusedAdd" => format!("{} + {}", locals[0], locals[1]),
+        "FusedMul" => format!("{} * {}", locals[0], locals[1]),
+        other => panic!("region_codegen: unknown FusedX op {other}"),
    }
 }

@@ -403,7 +327,7 @@ pub(crate) fn compile_region(
    let dtype: DType = fe_struct.dtype;

    // Aggregate all dynamic vars used anywhere in the region (FS strides,
-    // FE strides and elementwise shapes.
+    // FE strides, FusedX shape — all FusedX share `out_shape`, but their
    // own strides are likewise relevant for any future stride-affine ops).
    let mut all_vars: FxHashSet<char> = FxHashSet::default();
    all_vars.extend(out_shape.iter().flat_map(|e| e.dyn_vars()));
@@ -413,19 +337,6 @@ pub(crate) fn compile_region(
        let fs_struct: &FusionStart = (***fs_op).downcast_ref::<FusionStart>().unwrap();
        all_vars.extend(fs_struct.strides.iter().flat_map(|e| e.dyn_vars()));
    }
-    for &elem_idx in &region.elementwise_topo {
-        let elem_op = llir_graph[elem_idx].to_dialect::<dyn KernelOp>().unwrap();
-        if let Some(elem) = (***elem_op).downcast_ref::<CudaUnaryElementwise>() {
-            all_vars.extend(elem.shape.iter().flat_map(|e| e.dyn_vars()));
-            all_vars.extend(elem.in_strides.iter().flat_map(|e| e.dyn_vars()));
-            all_vars.extend(elem.out_strides.iter().flat_map(|e| e.dyn_vars()));
-        } else if let Some(elem) = (***elem_op).downcast_ref::<CudaBinaryElementwise>() {
-            all_vars.extend(elem.out_shape.iter().flat_map(|e| e.dyn_vars()));
-            all_vars.extend(elem.a_stride.iter().flat_map(|e| e.dyn_vars()));
-            all_vars.extend(elem.b_stride.iter().flat_map(|e| e.dyn_vars()));
-            all_vars.extend(elem.out_stride.iter().flat_map(|e| e.dyn_vars()));
-        }
-    }

    let cuda_ty = cuda_dtype(dtype);
    let includes = dtype_includes(&[dtype]);
@@ -451,19 +362,19 @@ pub(crate) fn compile_region(
    }
    let signature = signature_params.join(", ");

-    // Body: read FS leaves, then walk elementwise nodes in topo order emitting a
+    // Body: read FS leaves, then walk FusedX in topo order emitting a
    // local per op, then write FE output. Every node gets a local keyed
    // by a position-in-region index so the kernel string is invariant
    // under NodeIndex churn (each `egglog_to_llir` reissues NodeIndexes,
    // so naming locals by `n.index()` would invalidate the kernel
    // string cache on every search candidate). Indices: FS leaves get
-    // 0..fs_nodes.len(), elementwise nodes get fs_nodes.len()..(+ elementwise_topo.len()).
+    // 0..fs_nodes.len(), FusedX get fs_nodes.len()..(+ fusedx_topo.len()).
    let mut local_idx_map: FxHashMap<NodeIndex, usize> = FxHashMap::default();
    for (i, &fs_idx) in region.fs_nodes.iter().enumerate() {
        local_idx_map.insert(fs_idx, i);
    }
    let fs_count = region.fs_nodes.len();
-    for (i, &op_idx) in region.elementwise_topo.iter().enumerate() {
+    for (i, &op_idx) in region.fusedx_topo.iter().enumerate() {
        local_idx_map.insert(op_idx, fs_count + i);
    }
    let local_name = |n: NodeIndex| format!("v_{}", local_idx_map[&n]);
@@ -486,22 +397,12 @@ pub(crate) fn compile_region(
        ));
    }

-    // Elementwise ops in topo order. Each looks up its predecessor locals
+    // FusedX ops in topo order. Each looks up its predecessor locals
    // (in incoming-edge id order to match the original op's input
    // arity / position).
-    for &op_idx in &region.elementwise_topo {
+    for &op_idx in &region.fusedx_topo {
        let op_ref = llir_graph[op_idx].to_dialect::<dyn KernelOp>().unwrap();
-        let (elem_name, elem_dtype) =
-            if let Some(elem) = (***op_ref).downcast_ref::<CudaUnaryElementwise>() {
-                (elem.op.as_str(), elem.dtype)
-            } else if let Some(elem) = (***op_ref).downcast_ref::<CudaBinaryElementwise>() {
-                (elem.op.as_str(), elem.dtype)
-            } else {
-                panic!(
-                    "region_codegen: expected Cuda*Elementwise op, got {}",
-                    op_ref.kernel_name()
-                );
-            };
+        let op_name = op_ref.kernel_name();

        let mut input_locals: Vec<String> = llir_graph
            .edges_directed(op_idx, Direction::Incoming)
@@ -520,16 +421,15 @@ pub(crate) fn compile_region(
        input_locals = edges.into_iter().map(|(_, src)| local_name(src)).collect();
        let inputs_ref: Vec<&str> = input_locals.iter().map(|s| s.as_str()).collect();

-        let expr = elementwise_body(elem_name, &inputs_ref, elem_dtype);
-        let expr = elementwise_init_expr(&expr, elem_dtype, cuda_ty);
+        let expr = fused_body(op_name, &inputs_ref);
        body.push_str(&format!(
            "        {cuda_ty} {name} = {expr};\n",
            name = local_name(op_idx),
        ));
    }

-    // FE write: pick the elementwise node feeding FE (its single incoming edge in
-    // the region — an elementwise node or, in degenerate single-FS regions which
+    // FE write: pick the FusedX feeding FE (its single incoming edge in
+    // the region — a FusedX or, in degenerate single-FS regions which
    // shouldn't arise, an FS).
    let fe_input: NodeIndex = llir_graph
        .neighbors_directed(region.fe_node, Direction::Incoming)
@@ -577,63 +477,3 @@ pub(crate) fn compile_region(
        constants: FxHashMap::default(),
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::kernel::fusion::elementwise::CudaBinaryElementwise;
-    use luminal::op::LLIROp;
-    use luminal::prelude::petgraph::algo::toposort;
-
-    /// Helper: wrap a `KernelOp` in an `LLIROp` of the kernel dialect.
-    fn llir_of(op: impl KernelOp + 'static) -> LLIROp {
-        LLIROp::new::<dyn KernelOp>(Box::new(op) as Box<dyn KernelOp>)
-    }
-
-    /// Reproducer for the `FusionStart with no predecessor` panic at
-    /// `region_codegen.rs:232`. The egglog rolling pass + iterated mode
-    /// (`LUMINAL_LOOP_ROLL_ITERATE=1`) has been observed to produce LLIR
-    /// graphs where a `FusionStart` marker is reached as a region leaf
-    /// during the FE→FS walk but has no incoming edge — meaning the
-    /// region has nothing to read from. `build_compile_units` then
-    /// panics when constructing `external_inputs` because every FS leaf
-    /// is required to have exactly one external producer.
-    ///
-    /// Until that path is fixed, this test pins the failure mode so a
-    /// regression doesn't silently change the panic message or location.
-    /// `should_panic` rather than `ignore` so it stays runnable in CI
-    /// and surfaces if the panic ever moves.
-    #[test]
-    #[should_panic(expected = "FusionStart with no predecessor")]
-    fn fusion_start_with_no_predecessor_panics() {
-        // Minimal reproducer:
-        //
-        //   (no input) ──▶ FusionStart ──▶ CudaBinaryElementwise ──▶ FusionEnd
-        //
-        // CudaBinaryElementwise is a binary op (n_inputs = 2) so a real region would
-        // have two FS leaves. For this panic-shape test only the *first*
-        // FS leaf needs a missing predecessor — `build_compile_units`
-        // panics in `expect("FusionStart with no predecessor")` as soon
-        // as any FS in `fs_topo` lacks one. We add only one FS edge so
-        // CudaBinaryElementwise has a dangling second input slot, but that's fine:
-        // we're testing the specific panic path inside `build_compile_units`,
-        // not full kernel codegen.
-        let mut llir: LLIRGraph = LLIRGraph::default();
-
-        let fs_node = llir.add_node(llir_of(FusionStart::default()));
-        let fadd_node = llir.add_node(llir_of(CudaBinaryElementwise::default()));
-        let fe_node = llir.add_node(llir_of(FusionEnd::default()));
-
-        // FusionStart → CudaBinaryElementwise → FusionEnd.
-        llir.add_edge(fs_node, fadd_node, ());
-        llir.add_edge(fadd_node, fe_node, ());
-
-        let topo = toposort(&llir, None).expect("LLIR cycle in test setup");
-        let absorbed = globally_absorbed_markers(&llir);
-
-        // This is the call that panics with `FusionStart with no
-        // predecessor` because `fs_node`'s incoming-edges iterator is
-        // empty.
-        let _ = build_compile_units(&topo, &llir, &absorbed);
-    }
-}
--- a/crates/luminal_cuda_lite/src/kernel/hlir.rs
+++ b/crates/luminal_cuda_lite/src/kernel/hlir.rs
--- a/crates/luminal_cuda_lite/src/kernel/matmul2d.rs
+++ b/crates/luminal_cuda_lite/src/kernel/matmul2d.rs
@@ -1,427 +0,0 @@
-//! Direct 2D matmul kernel — bypasses egglog rewrites, used as a custom op
-//! for matmul shapes where the cublaslt egg rules don't reliably fire.
-//!
-//! The cublaslt 2D rules in `host/cublaslt/cublaslt_*Cm_rewrite.egg` /
-//! `cublaslt_Rm*_rewrite.egg` are *supposed* to match any 2D matmul whose
-//! Mul + SumReduce broadcast lowering has the expected stride patterns,
-//! and the conditional matmul cleanup is *supposed* to delete the
-//! elementwise Mul + KernelSumReduce fallback whenever a cublaslt alternative
-//! exists. In practice both fail to fire reliably for the VAE's mid-block
-//! `AttnBlock` matmuls — at 1024² that lets the search occasionally pick
-//! the broadcast-Mul path for `q @ kᵀ`, generating a `(HW, HW, C) =
-//! (16384, 16384, 512)` ≈ 524 GiB single intermediate that OOMs the GPU.
-//!
-//! Same approach as `kernel::conv2d`: define a `KernelOp`, wrap it in a
-//! `CustomOp`, expose a tiny `pub fn` so callers don't see the
-//! `cx.custom_op` plumbing. This is opaque to egglog by design — we
-//! aren't trying to fuse with surrounding ops, just guarantee a sane
-//! lowering for the matmuls we know are problematic.
-//!
-//! The CUDA implementation is a textbook 2D-blocked SGEMM:
-//!   * 16×16 output tile per block (256 threads)
-//!   * Tiled load of A and B into shared memory in K-size chunks
-//!   * Each thread accumulates one output element across all K-tiles
-//!   * Optional bias broadcast along the M axis at write-out
-//!   * `transpose_b` toggles between row-major B `(K, N)` and row-major
-//!     B `(N, K)` (i.e. the `A @ Bᵀ` pattern that linear/projection
-//!     layers use).
-
-use std::sync::Arc;
-
-use cudarc::driver::{CudaFunction, CudaModule, CudaSlice, CudaStream};
-use luminal::{
-    dtype::DType, op::CustomOp, op::LLIROp, prelude::FxHashMap, prelude::GraphTensor,
-    shape::Expression,
-};
-
-use crate::compile_module_image_for_current_device;
-use crate::kernel::KernelOp;
-
-/// Direct 2D matmul `(M, K) × {(K, N) | (N, K)} → (M, N)` with optional
-/// per-output-column bias and an optional batch axis. A and output are
-/// always F32. B can be F32 or BF16; BF16 is converted to F32 on each
-/// load, which avoids materializing the cast as a separate intermediate
-/// tensor (important for the text encoder / transformer where the F32-
-/// cast weights would not fit in GPU memory). All shape parameters are
-/// static (baked into the CUDA source via #defines).
-///
-/// When `batch > 1` the kernel does `batch` independent 2D matmuls in
-/// parallel: A is `(batch, M, K)`, B is `(batch, *, *)` with the same
-/// per-batch shape, output is `(batch, M, N)`. All three are assumed
-/// contiguous row-major across batches (i.e. `a_batch_stride = M*K`,
-/// `b_batch_stride = K*N` or `N*K` depending on `transpose_b`,
-/// `out_batch_stride = M*N`). Bias does NOT have a batch axis — it's
-/// `(N,)` and broadcast across batches.
-#[derive(Debug, Clone)]
-pub struct Matmul2DKernel {
-    pub m: usize,
-    pub n: usize,
-    pub k: usize,
-    pub batch: usize,
-    /// If `true`, B is interpreted as `(N, K)` row-major and accessed as
-    /// `B[n][k]` (i.e. `A @ Bᵀ`). If `false`, B is `(K, N)` row-major and
-    /// accessed as `B[k][n]` (i.e. `A @ B`).
-    pub transpose_b: bool,
-    pub has_bias: bool,
-    /// Storage dtype of B. Currently F32 or BF16 are supported.
-    pub weight_dtype: DType,
-}
-
-const TILE: usize = 16;
-
-impl KernelOp for Matmul2DKernel {
-    fn compile(
-        &self,
-        stream: &Arc<CudaStream>,
-        compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
-    ) -> (
-        CudaFunction,
-        Arc<CudaModule>,
-        String,
-        (Expression, Expression, Expression),
-        (Expression, Expression, Expression),
-        Expression,
-        FxHashMap<char, CudaSlice<u8>>,
-    ) {
-        let bias_param = if self.has_bias {
-            ", const float* __restrict__ bias"
-        } else {
-            ""
-        };
-        let bias_add = if self.has_bias {
-            "    acc += bias[n];\n"
-        } else {
-            ""
-        };
-        // We want Bs[ty][tx] = B_effective[k0+ty][b_n_base+tx] where:
-        //   transpose_b=false: B is (K, N) row-major → B[(k0+ty)*N + (b_n_base+tx)]
-        //   transpose_b=true:  B is (N, K) row-major → B[(b_n_base+tx)*K + (k0+ty)]
-        // Plus the per-batch offset (`b_batch_off`).
-        let b_index_expr = if self.transpose_b {
-            "b_batch_off + (b_n_base + tx) * K + (k0 + ty)"
-        } else {
-            "b_batch_off + (k0 + ty) * N + (b_n_base + tx)"
-        };
-        // Convert B's element to float on load. For BF16 we declare B as
-        // `__nv_bfloat16*` and use `__bfloat162float`; for F32 it's a no-op.
-        let (b_param_type, b_load_expr, bf16_include) = match self.weight_dtype {
-            DType::F32 => (
-                "const float* __restrict__ B",
-                format!("B[{b_index_expr}]"),
-                "",
-            ),
-            DType::Bf16 => (
-                "const __nv_bfloat16* __restrict__ B",
-                format!("__bfloat162float(B[{b_index_expr}])"),
-                "#include <cuda_bf16.h>\n",
-            ),
-            other => panic!("Matmul2DKernel: unsupported weight_dtype {other:?}"),
-        };
-
-        let kernel = format!(
-            "
-{bf16_include}extern \"C\" __global__ void matmul_2d_kernel(
-    float* __restrict__ C,
-    const float* __restrict__ A,
-    {b_param_type}{bias_param}
-) {{
-    const int M = {m};
-    const int N = {n};
-    const int K = {k};
-    const int TILE = {tile};
-
-    __shared__ float As[{tile}][{tile}];
-    __shared__ float Bs[{tile}][{tile}];
-
-    int bx = blockIdx.x;  // tile column (n)
-    int by = blockIdx.y;  // tile row (m)
-    int batch = blockIdx.z; // batch index (0..BATCH-1)
-    int tx = threadIdx.x; // 0..TILE-1, output col within tile
-    int ty = threadIdx.y; // 0..TILE-1, output row within tile
-
-    int m_global = by * TILE + ty;
-    int n_global = bx * TILE + tx;
-
-    int a_m_base = by * TILE;
-    int b_n_base = bx * TILE;
-
-    // Per-batch base pointer offsets (contiguous row-major across batches).
-    int a_batch_off = batch * (M * K);
-    int b_batch_off = batch * (K * N);
-    int c_batch_off = batch * (M * N);
-
-    float acc = 0.0f;
-
-    int n_tiles = (K + TILE - 1) / TILE;
-    for (int t = 0; t < n_tiles; ++t) {{
-        int k0 = t * TILE;
-
-        // Load A tile (TILE, TILE) row-major from A[m, k]: A[(by*TILE+ty)*K + (k0+tx)]
-        int a_m = a_m_base + ty;
-        int a_k = k0 + tx;
-        As[ty][tx] = (a_m < M && a_k < K) ? A[a_batch_off + a_m * K + a_k] : 0.0f;
-
-        // Load B tile depending on transpose_b
-        int b_n_or_k = b_n_base + tx;  // for transpose_b=true this is N; for =false this is N
-        int b_k_or_k = k0 + ty;        // similarly
-        // We compute Bs[ty][tx] such that the inner loop reads Bs[k_local][n_local] = B[k][n].
-        // For transpose_b=true (B is (N,K)):  B[k][n] in math = B_storage[n][k] = B[(b_n_base+tx)*K + (k0+ty)]
-        // For transpose_b=false (B is (K,N)): B[k][n] in math = B_storage[k][n] = B[(k0+ty)*N + (b_n_base+tx)]
-        bool b_in_bounds = ({transpose_b} ? (b_n_or_k < N && b_k_or_k < K)
-                                          : (b_k_or_k < K && b_n_or_k < N));
-        Bs[ty][tx] = b_in_bounds ? ({b_load_expr}) : 0.0f;
-
-        __syncthreads();
-
-        #pragma unroll
-        for (int kk = 0; kk < {tile}; ++kk) {{
-            acc += As[ty][kk] * Bs[kk][tx];
-        }}
-        __syncthreads();
-    }}
-
-    if (m_global < M && n_global < N) {{
-        int n = n_global;
-{bias_add}        C[c_batch_off + m_global * N + n_global] = acc;
-    }}
-}}
-",
-            m = self.m,
-            n = self.n,
-            k = self.k,
-            tile = TILE,
-            transpose_b = self.transpose_b,
-            b_load_expr = b_load_expr,
-            b_param_type = b_param_type,
-            bias_param = bias_param,
-            bias_add = bias_add,
-            bf16_include = bf16_include,
-        );
-
-        let (module, func) = if let Some((m, f)) = compile_cache.get(&kernel) {
-            (m.clone(), f.clone())
-        } else {
-            let ptx = compile_module_image_for_current_device(stream.context(), &kernel).unwrap();
-            let module = stream.context().load_module(ptx).unwrap();
-            let func = module.load_function("matmul_2d_kernel").unwrap();
-            compile_cache.insert(kernel.clone(), (module.clone(), func.clone()));
-            (module, func)
-        };
-
-        let grid_x = self.n.div_ceil(TILE);
-        let grid_y = self.m.div_ceil(TILE);
-        (
-            func,
-            module,
-            kernel,
-            (
-                Expression::from(grid_x),
-                Expression::from(grid_y),
-                Expression::from(self.batch),
-            ),
-            (
-                Expression::from(TILE),
-                Expression::from(TILE),
-                Expression::from(1usize),
-            ),
-            Expression::from(0usize),
-            FxHashMap::default(),
-        )
-    }
-
-    fn output_size(&self) -> Expression {
-        Expression::from(self.batch * self.m * self.n)
-    }
-
-    fn output_bytes(&self) -> Expression {
-        self.output_size() * 4
-    }
-
-    fn output_dtype(&self) -> DType {
-        DType::F32
-    }
-
-    fn bytes_loaded(&self) -> Expression {
-        // K elements from A (F32) + K elements from B (F32 or BF16) + maybe bias (F32).
-        let b_bytes = match self.weight_dtype {
-            DType::F32 => 4,
-            DType::Bf16 => 2,
-            _ => 4,
-        };
-        let bias_bytes = if self.has_bias { 4 } else { 0 };
-        Expression::from(
-            self.batch * self.m * self.n * (self.k * 4 + self.k * b_bytes + bias_bytes),
-        )
-    }
-
-    fn bytes_stored(&self) -> Expression {
-        self.output_size() * 4
-    }
-
-    fn flops(&self) -> Expression {
-        let per_out = self.k * 2 + if self.has_bias { 1 } else { 0 };
-        Expression::from(self.batch * self.m * self.n * per_out)
-    }
-
-    fn kernel_name(&self) -> &'static str {
-        "Matmul2D"
-    }
-}
-
-/// CustomOp wrapper for [`Matmul2DKernel`].
-#[derive(Debug, Clone)]
-pub struct Matmul2DCustom(pub Matmul2DKernel);
-
-impl CustomOp for Matmul2DCustom {
-    fn to_llir_op(&self) -> LLIROp {
-        LLIROp::new::<dyn KernelOp>(Box::new(self.0.clone()) as Box<dyn KernelOp>)
-    }
-}
-
-/// `(M, K) @ (K, N) -> (M, N)` for row-major F32 inputs. No bias.
-pub fn matmul_2d(a: GraphTensor, b: GraphTensor) -> GraphTensor {
-    matmul_inner(a, b, /*transpose_b=*/ false, None)
-}
-
-/// `(M, K) @ (N, K)ᵀ -> (M, N)` for row-major F32 inputs. No bias.
-/// Use this for `A @ Bᵀ` where B is stored row-major as `(N, K)` — the
-/// pattern produced by linear / projection layers (`x @ w.t()`).
-pub fn matmul_2d_t(a: GraphTensor, b: GraphTensor) -> GraphTensor {
-    matmul_inner(a, b, /*transpose_b=*/ true, None)
-}
-
-/// Linear projection with bias: `(M, K) @ (N, K)ᵀ + bias` where bias is
-/// `(N,)`, row-major F32 throughout.
-pub fn linear_bias(a: GraphTensor, b: GraphTensor, bias: GraphTensor) -> GraphTensor {
-    matmul_inner(a, b, /*transpose_b=*/ true, Some(bias))
-}
-
-/// Mixed-precision linear (no bias): `A (F32, M, K) @ B (BF16, N, K)ᵀ → (F32, M, N)`.
-///
-/// Lowers as plain HLIR — `Cast(A, BF16) @ permute(B_bf16) → Cast(F32)`.
-/// The activation cast and output cast are tiny (M*K and M*N elements;
-/// the K=hidden weight stays BF16). The inner BF16 matmul matches the
-/// existing cublaslt rewrite rules and runs as
-/// `CUBLAS_COMPUTE_32F_FAST_16BF` — Hopper's native 2× BF16 path.
-pub fn linear_no_bias_bf16_w(a: GraphTensor, b_bf16: GraphTensor) -> GraphTensor {
-    assert_eq!(a.dtype, DType::F32, "linear_no_bias_bf16_w expects F32 A");
-    assert_eq!(
-        b_bf16.dtype,
-        DType::Bf16,
-        "linear_no_bias_bf16_w expects BF16 B"
-    );
-    let a_dims = a.dims();
-    let b_dims = b_bf16.dims();
-    assert_eq!(a_dims.len(), 2);
-    assert_eq!(b_dims.len(), 2);
-    let a_bf16 = a.cast(DType::Bf16);
-    let b_kn = b_bf16.permute((1, 0));
-    a_bf16.matmul(b_kn).cast(DType::F32)
-}
-
-/// Batched matmul: `A (B, M, K) @ B (B, K, N) → (B, M, N)`, all F32 row-major.
-pub fn matmul_3d(a: GraphTensor, b: GraphTensor) -> GraphTensor {
-    matmul_inner(a, b, /*transpose_b=*/ false, None)
-}
-
-/// Batched matmul with B-transpose: `A (B, M, K) @ B (B, N, K)ᵀ → (B, M, N)`.
-pub fn matmul_3d_t(a: GraphTensor, b: GraphTensor) -> GraphTensor {
-    matmul_inner(a, b, /*transpose_b=*/ true, None)
-}
-
-fn matmul_inner(
-    a: GraphTensor,
-    b: GraphTensor,
-    transpose_b: bool,
-    bias: Option<GraphTensor>,
-) -> GraphTensor {
-    assert_eq!(a.dtype, DType::F32, "matmul requires F32 A");
-    let weight_dtype = b.dtype;
-    assert!(
-        matches!(weight_dtype, DType::F32 | DType::Bf16),
-        "matmul B must be F32 or BF16, got {weight_dtype:?}",
-    );
-    let a_dims = a.dims();
-    let b_dims = b.dims();
-    assert_eq!(
-        a_dims.len(),
-        b_dims.len(),
-        "matmul A/B rank mismatch: {} vs {}",
-        a_dims.len(),
-        b_dims.len(),
-    );
-    assert!(
-        a_dims.len() == 2 || a_dims.len() == 3,
-        "matmul expects rank 2 or 3, got rank {}",
-        a_dims.len(),
-    );
-
-    let (batch, a_off) = if a_dims.len() == 3 {
-        let ba = a_dims[0].to_usize().expect("batch dim must be static");
-        let bb = b_dims[0].to_usize().expect("batch dim must be static");
-        assert_eq!(
-            ba, bb,
-            "matmul batch dim mismatch: A batch={ba}, B batch={bb}"
-        );
-        (ba, 1)
-    } else {
-        (1, 0)
-    };
-
-    let m = a_dims[a_off].to_usize().expect("M must be a static dim");
-    let k_a = a_dims[a_off + 1]
-        .to_usize()
-        .expect("K (A) must be a static dim");
-    let (n, k_b) = if transpose_b {
-        // B per-batch is (N, K)
-        let n = b_dims[a_off].to_usize().expect("N must be a static dim");
-        let k = b_dims[a_off + 1]
-            .to_usize()
-            .expect("K (B) must be a static dim");
-        (n, k)
-    } else {
-        // B per-batch is (K, N)
-        let k = b_dims[a_off]
-            .to_usize()
-            .expect("K (B) must be a static dim");
-        let n = b_dims[a_off + 1]
-            .to_usize()
-            .expect("N must be a static dim");
-        (n, k)
-    };
-    assert_eq!(k_a, k_b, "matmul K mismatch: A K={k_a}, B K={k_b}");
-    let k = k_a;
-
-    let has_bias = bias.is_some();
-    if let Some(bias) = bias {
-        let bdims = bias.dims();
-        assert_eq!(bdims.len(), 1, "matmul bias must be 1D");
-        assert_eq!(
-            bdims[0].to_usize().expect("bias dim must be static"),
-            n,
-            "matmul bias size must equal N"
-        );
-        assert_eq!(bias.dtype, DType::F32, "matmul bias must be F32");
-    }
-
-    let kern = Matmul2DKernel {
-        m,
-        n,
-        k,
-        batch,
-        transpose_b,
-        has_bias,
-        weight_dtype,
-    };
-    let cx = unsafe { &mut *a.graph_ref };
-    let inputs: Vec<GraphTensor> = if let Some(bias) = bias {
-        vec![a, b, bias]
-    } else {
-        vec![a, b]
-    };
-    if batch == 1 {
-        cx.custom_op(Matmul2DCustom(kern), inputs, (m, n), DType::F32)
-    } else {
-        cx.custom_op(Matmul2DCustom(kern), inputs, (batch, m, n), DType::F32)
-    }
-}
--- a/crates/luminal_cuda_lite/src/kernel/mod.rs
+++ b/crates/luminal_cuda_lite/src/kernel/mod.rs
@@ -9,21 +9,12 @@ use luminal_tracing::schema::{
 };
 use uuid::Uuid;

-pub mod conv2d;
 pub mod cuda_graph;
 pub mod fusion;
 pub mod hlir;
-pub mod matmul2d;
 pub mod other_ops;
-pub mod rope;

-pub use conv2d::{Conv2DCustom, Conv2DKernel, conv2d_bias};
 pub use cuda_graph::*;
-pub use matmul2d::{
-    Matmul2DCustom, Matmul2DKernel, linear_bias, linear_no_bias_bf16_w, matmul_2d, matmul_2d_t,
-    matmul_3d, matmul_3d_t,
-};
-pub use rope::{RoPECustom, RoPEKernel, apply_rope};

 pub type Ops = (hlir::Ops, other_ops::Ops, fusion::Ops);

--- a/crates/luminal_cuda_lite/src/kernel/other_ops.rs
+++ b/crates/luminal_cuda_lite/src/kernel/other_ops.rs
@@ -23,6 +23,8 @@ pub type Ops = (
    KernelBatchMatMul,
    KernelScatterNoCopy,
    KernelSoftmax,
+    KernelExp,
+    KernelSigmoid,
 );

 #[derive(Default, Debug, Clone)]
@@ -126,8 +128,7 @@ impl KernelOp for KernelMeanReduce {
        let dtype = cuda_dtype(self.dtype);
        let includes = dtype_includes(&[self.dtype]);
        let n_outputs: Expression = self.out_shape.iter().copied().product();
-        let threads_per_block: usize = 256; // 8 warps per block
-        let n_warps = threads_per_block / 32;
+        let threads_per_block = 256; // 8 warps per block
        let (dyn_defines, _sorted_dims) = generate_dyn_dims_defines(&vars);
        let dyn_dims_param = if vars.is_empty() {
            ""
@@ -148,24 +149,12 @@ extern \"C\" {{
        long long iters = {iters};
        long long iter_stride = {iter_stride};

-        float thread_sum = 0.0f;
-        for (long long i = threadIdx.x; i < iters; i += {threads_per_block})
-            thread_sum += (float)in[in_start + i * iter_stride];
-
-        for (int offset = 16; offset > 0; offset >>= 1)
-            thread_sum += __shfl_down_sync(0xffffffff, thread_sum, offset);
-
-        __shared__ float warp_sums[{n_warps}];
-        int lane = threadIdx.x & 31;
-        int warp = threadIdx.x >> 5;
-        if (lane == 0) warp_sums[warp] = thread_sum;
-        __syncthreads();
-
-        if (threadIdx.x == 0) {{
-            float sum = 0.0f;
-            for (int w = 0; w < {n_warps}; w++) sum += warp_sums[w];
-            out[{out_index}] = ({dtype})(sum / (float)iters);
+        {dtype} sum = 0;
+        for (long long i = 0; i < iters; i++) {{
+            sum += in[in_start + i * iter_stride];
        }}
+
+        out[{out_index}] = ({dtype})(sum / ({dtype})iters);
    }}
 }}",
            dtype = dtype,
@@ -178,8 +167,6 @@ extern \"C\" {{
                .substitute('z', Expression::from(1))
                .simplify()
                .to_kernel(),
-            threads_per_block = threads_per_block,
-            n_warps = n_warps,
        );

        let (module, func) = if let Some((module, func)) = compile_cache.get(&kernel) {
@@ -196,9 +183,9 @@ extern \"C\" {{
            func,
            module,
            kernel,
-            (n_outputs, 1.into(), 1.into()),                // grid
-            (threads_per_block.into(), 1.into(), 1.into()), // block
-            0.into(),                                       // shmem size
+            (n_outputs, 1.into(), 1.into()), // grid
+            (1.into(), 1.into(), 1.into()),  // blocks (single-threaded)
+            0.into(),                        // shmem size
            FxHashMap::default(),
        )
    }
@@ -292,9 +279,6 @@ impl EgglogOp for KernelScatterNoCopy {
    fn rewrites(&self) -> Vec<Rule> {
        // Match KernelScatter and rewrite to KernelScatterNoCopy with ConsumedBuffer on dest.
        // ConsumedBuffer wraps dest to signal in-place modification.
-        // This is only valid when the destination buffer can also represent
-        // the scatter output layout. If dest is a strided/broadcast view,
-        // regular Scatter must first materialize a contiguous output copy.
        //
        // Two-phase resolution:
        // 1. During (run): cleanup rules delete ConsumedBuffer if dest is shared (another op uses it)
@@ -305,31 +289,12 @@ impl EgglogOp for KernelScatterNoCopy {
        // If ConsumedBuffer was deleted (shared case), cascade cleanup removes the dependent
        // ICons and KernelScatterNoCopy Op, leaving only KernelScatter.
        let mut rules = vec![
-            Rule::raw("(relation consumed_buffer_ilist_contains (IList IR))"),
-            Rule::raw(
-                "(rule
-                    ((= ?list (ICons ?head ?tail)))
-                    ((consumed_buffer_ilist_contains ?list ?head))
-                    :ruleset cleanup
-                    :name \"consumed-buffer-ilist-contains-head\"
-                )",
-            ),
-            Rule::raw(
-                "(rule
-                    ((= ?list (ICons ?head ?tail))
-                     (consumed_buffer_ilist_contains ?tail ?item))
-                    ((consumed_buffer_ilist_contains ?list ?item))
-                    :ruleset cleanup
-                    :name \"consumed-buffer-ilist-contains-tail\"
-                )",
-            ),
            // Rewrite: KernelScatter -> KernelScatterNoCopy with ConsumedBuffer
            Rule::raw(
                "(rule
                    (
                        (= ?scatter (Op (KernelScatter ?ds ?dst ?is ?istr ?ss ?os ?dt)
                            (ICons ?dest (ICons ?indexes (ICons ?src (INil))))))
-                        (= ?dst ?os)
                        (= ?dty (dtype ?src))
                    )
                    (
@@ -339,7 +304,6 @@ impl EgglogOp for KernelScatterNoCopy {
                        (union ?scatter ?nocopy)
                        (set (dtype ?nocopy) ?dty)
                    )
-                    :ruleset buffer_reuse
                    :name \"scatter to scatter-no-copy\"
                )",
            ),
@@ -349,7 +313,6 @@ impl EgglogOp for KernelScatterNoCopy {
                    ((= ?cb (ConsumedBuffer ?a))
                     (= ?dt (dtype ?a)))
                    ((set (dtype ?cb) ?dt))
-                    :ruleset dtype_prop
                    :name \"consumed-buffer-dtype\"
                )",
            ),
@@ -359,28 +322,13 @@ impl EgglogOp for KernelScatterNoCopy {
            "(rule
                ((= ?cb (ConsumedBuffer ?a))
                 (= ?op1 (Op ?k1 ?ilist1))
-                 (consumed_buffer_ilist_contains ?ilist1 ?cb)
+                 (= ?ilist1 (ICons ?cb ?rest1))
                 (= ?op2 (Op ?k2 ?ilist2))
                 (!= ?op1 ?op2)
-                 (consumed_buffer_ilist_contains ?ilist2 ?a))
+                 (= ?ilist2 (ICons ?a ?t2)))
                ((delete (ConsumedBuffer ?a)))
                :ruleset cleanup
-                :name \"consumed-buffer-cleanup-shared-op-use\"
-            )",
-        ));
-        // If a valid no-copy scatter survives cleanup, it dominates the copying scatter.
-        // This must run before base_cleanup resolves ConsumedBuffer back to the destination.
-        rules.push(Rule::raw(
-            "(rule
-                ((= ?cb (ConsumedBuffer ?dest))
-                 (= ?scatter (Op (KernelScatter ?ds ?dst ?is ?istr ?ss ?os ?dt)
-                     (ICons ?dest (ICons ?indexes (ICons ?src (INil))))))
-                 (= ?nocopy (Op (KernelScatterNoCopy ?ds ?dst ?is ?istr ?ss ?os ?dt)
-                     (ICons ?cb (ICons ?indexes (ICons ?src (INil)))))))
-                ((delete (Op (KernelScatter ?ds ?dst ?is ?istr ?ss ?os ?dt)
-                     (ICons ?dest (ICons ?indexes (ICons ?src (INil)))))))
-                :ruleset post_cleanup
-                :name \"scatter-no-copy-dominates-valid-consumed-buffer\"
+                :name \"consumed-buffer-cleanup-pos\"
            )",
        ));
        // Surviving ConsumedBuffers are valid — union with source and delete.
@@ -507,8 +455,8 @@ extern \"C\" {{
            func,
            module,
            scatter_kernel,
-            (n_src.ceil_div(256), 1.into(), 1.into()),
-            (256.into(), 1.into(), 1.into()),
+            (n_src, 1.into(), 1.into()),
+            (1.into(), 1.into(), 1.into()),
            0.into(),
            FxHashMap::default(),
        )
@@ -623,7 +571,7 @@ extern \"C\" {{
 // KernelBatchMatVec: Fused batched matrix-vector product for attention
 // Matches: Mul(broadcast) + Sum pattern for [B, 1, K] x [B, K, N] -> [B, 1, N]
 // or [B, M, K] x [B, K, N] -> [B, M, N] with small M
-// Replaces the broadcast elementwise Mul + single-threaded KernelSumReduce pipeline
+// Replaces the broadcast KernelMul + single-threaded KernelSumReduce pipeline
 // =============================================================================

 #[derive(Default, Debug, Clone)]
@@ -711,7 +659,6 @@ impl EgglogOp for KernelBatchMatVec {
                    (union ?sum ?bmv)
                    (set (dtype ?bmv) (F32))
                )
-                :ruleset matmul_backend
                :name \"batch mat-vec\"
            )"
        )]
@@ -992,7 +939,6 @@ impl EgglogOp for KernelBatchMatMul {
                    (union ?sum ?bmm)
                    (set (dtype ?bmm) (F32))
                )
-                :ruleset matmul_backend
                :name \"batch matmul\"
            )"
        )]
@@ -1232,7 +1178,6 @@ impl EgglogOp for KernelSoftmax {
                    (union ?sm ?ksm)
                    (set (dtype ?ksm) (F32))
                )
-                :ruleset kernel_lower
                :name \"softmax-to-kernel-f32\"
            )",
            ),
@@ -1454,3 +1399,370 @@ extern \"C\" {{
        "Softmax"
    }
 }
+
+// KernelExp: native exp (uses expf instead of exp2f * constant)
+// Single-kernel alternative to the 3-kernel Constant+Mul+Exp2 path.
+// Improves numerical precision by avoiding the truncated log2(e) constant.
+
+#[derive(Default, Debug, Clone)]
+pub struct KernelExp {
+    shape: Vec<Expression>,
+    in_strides: Vec<Expression>,
+    out_strides: Vec<Expression>,
+    dtype: DType,
+}
+
+impl EgglogOp for KernelExp {
+    fn sort(&self) -> SortDef {
+        sort(
+            OP_KIND,
+            "KernelExp",
+            &[
+                ("shape", ELIST),
+                ("strides", ELIST),
+                ("out_strides", ELIST),
+                ("dtype", DTYPE),
+            ],
+        )
+    }
+
+    fn n_inputs(&self) -> usize {
+        1
+    }
+
+    fn rewrites(&self) -> Vec<Rule> {
+        vec![
+            // Match Exp2(Mul(x, log2e_constant)) directly.
+            // This matches the pattern created by frontend exp() = (self * (1/ln(2))).exp2()
+            Rule::raw(
+                "(rule
+                (
+                    (= ?mul (Op (Mul ?shape ?x_stride ?const_stride ?inter_stride) (ICons ?x (ICons ?exp_const (INil)))))
+                    (= ?exp2 (Op (Exp2 ?shape ?inter_stride ?out_stride) (ICons ?mul (INil))))
+                    (= ?dt (dtype ?x))
+                    (= ?cv (Op (Constant ?val) (INil)))
+                    (= ?exp_const ?cv)
+                    (> ?val 1.44)
+                    (< ?val 1.45)
+                )
+                (
+                    (let ?kexp (Op (KernelExp ?shape ?x_stride ?out_stride ?dt) (ICons ?x (INil))))
+                    (union ?exp2 ?kexp)
+                    (set (dtype ?kexp) ?dt)
+                )
+                :name \"direct-exp-fusion\"
+            )",
+            ),
+        ]
+    }
+
+    fn cleanup(&self) -> bool {
+        false
+    }
+
+    fn extract<'a>(
+        &'a self,
+        egraph: &'a SerializedEGraph,
+        kind_children: &[&'a ENodeId],
+        input_enodes: Vec<&'a ENodeId>,
+        list_cache: &mut FxHashMap<&'a ENodeId, Vec<Expression>>,
+        expr_cache: &mut FxHashMap<&'a ENodeId, Expression>,
+    ) -> (LLIROp, Vec<&'a ENodeId>) {
+        (
+            LLIROp::new::<dyn KernelOp>(Box::new(Self {
+                shape: extract_expr_list(egraph, kind_children[0], list_cache, expr_cache).unwrap(),
+                in_strides: extract_expr_list(egraph, kind_children[1], list_cache, expr_cache)
+                    .unwrap(),
+                out_strides: extract_expr_list(egraph, kind_children[2], list_cache, expr_cache)
+                    .unwrap(),
+                dtype: extract_dtype(egraph, kind_children[3]),
+            })),
+            input_enodes,
+        )
+    }
+}
+
+impl KernelOp for KernelExp {
+    fn compile(
+        &self,
+        stream: &Arc<CudaStream>,
+        compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
+    ) -> (
+        CudaFunction,
+        Arc<CudaModule>,
+        String,
+        (Expression, Expression, Expression),
+        (Expression, Expression, Expression),
+        Expression,
+        FxHashMap<char, CudaSlice<u8>>,
+    ) {
+        let vars = self
+            .shape
+            .iter()
+            .flat_map(|e| e.dyn_vars())
+            .chain(self.in_strides.iter().flat_map(|e| e.dyn_vars()))
+            .chain(self.out_strides.iter().flat_map(|e| e.dyn_vars()))
+            .collect::<FxHashSet<_>>();
+        let dtype = cuda_dtype(self.dtype);
+        let includes = dtype_includes(&[self.dtype]);
+        let (dyn_defines, _sorted_dims) = generate_dyn_dims_defines(&vars);
+        let dyn_dims_param = if vars.is_empty() {
+            ""
+        } else {
+            ", const int* dyn_dims"
+        };
+        let n_elements = self
+            .shape
+            .iter()
+            .copied()
+            .product::<Expression>()
+            .to_kernel();
+        let out_idx = flatten_strides(&self.shape, &self.out_strides).to_kernel();
+        let in_idx = flatten_strides(&self.shape, &self.in_strides).to_kernel();
+        let kernel = format!(
+            "{includes}
+{dyn_defines}
+extern \"C\" {{
+    __global__ void exp_k({dtype} *out, const {dtype} *in{dyn_dims_param}) {{
+        long long const_z = (long long)blockIdx.x * blockDim.x + threadIdx.x;
+        if (const_z >= {n_elements}) return;
+        out[{out_idx}] = expf(in[{in_idx}]);
+    }}
+}}"
+        );
+        let (module, func) = if let Some((module, func)) = compile_cache.get(&kernel) {
+            (module.clone(), func.clone())
+        } else {
+            let ptx = compile_module_image_for_current_device(stream.context(), &kernel).unwrap();
+            let module = stream.context().load_module(ptx).unwrap();
+            let func = module.load_function("exp_k").unwrap();
+            compile_cache.insert(kernel.clone(), (module.clone(), func.clone()));
+            (module, func)
+        };
+        let out_size = self.shape.iter().copied().product::<Expression>();
+        (
+            func,
+            module,
+            kernel,
+            (out_size.ceil_div(256), 1.into(), 1.into()),
+            (out_size.min(256), 1.into(), 1.into()),
+            0.into(),
+            FxHashMap::default(),
+        )
+    }
+
+    fn output_size(&self) -> Expression {
+        self.shape.iter().copied().product()
+    }
+
+    fn output_bytes(&self) -> Expression {
+        (self.output_size() * self.dtype.bits()).ceil_div(8)
+    }
+
+    fn bytes_loaded(&self) -> Expression {
+        self.output_bytes()
+    }
+
+    fn bytes_stored(&self) -> Expression {
+        self.output_bytes()
+    }
+
+    fn flops(&self) -> Expression {
+        self.shape.iter().copied().product()
+    }
+
+    fn output_dtype(&self) -> DType {
+        self.dtype
+    }
+
+    fn kernel_name(&self) -> &'static str {
+        "Exp"
+    }
+}
+
+// KernelSigmoid: fused sigmoid = 1/(1+exp(-x))
+// Single-kernel alternative to the 5-kernel Neg+Exp+Const+Add+Recip path.
+
+#[derive(Default, Debug, Clone)]
+pub struct KernelSigmoid {
+    shape: Vec<Expression>,
+    in_strides: Vec<Expression>,
+    out_strides: Vec<Expression>,
+    dtype: DType,
+}
+
+impl EgglogOp for KernelSigmoid {
+    fn sort(&self) -> SortDef {
+        sort(
+            OP_KIND,
+            "KernelSigmoid",
+            &[
+                ("shape", ELIST),
+                ("strides", ELIST),
+                ("out_strides", ELIST),
+                ("dtype", DTYPE),
+            ],
+        )
+    }
+
+    fn n_inputs(&self) -> usize {
+        1
+    }
+
+    fn rewrites(&self) -> Vec<Rule> {
+        vec![
+            // Match the HLIR pattern directly: Recip(Add(Exp2(Mul(Mul(x, -1), log2e)), 1))
+            Rule::raw(
+                "(rule
+                (
+                    (= ?neg1 (Op (Constant ?nv) (INil)))
+                    (< ?nv -0.99)
+                    (> ?nv -1.01)
+                    (= ?neg_x (Op (Mul ?shape ?x_stride ?neg_stride ?neg_out_stride) (ICons ?x (ICons ?neg1 (INil)))))
+                    (= ?log2e (Op (Constant ?lv) (INil)))
+                    (> ?lv 1.44)
+                    (< ?lv 1.45)
+                    (= ?scaled (Op (Mul ?shape ?neg_out_stride ?log2e_stride ?scaled_stride) (ICons ?neg_x (ICons ?log2e (INil)))))
+                    (= ?exp2 (Op (Exp2 ?shape ?scaled_stride ?exp_stride) (ICons ?scaled (INil))))
+                    (= ?one (Op (Constant ?ov) (INil)))
+                    (> ?ov 0.99)
+                    (< ?ov 1.01)
+                    (= ?plus_one (Op (Add ?shape ?exp_stride ?one_stride ?add_stride) (ICons ?exp2 (ICons ?one (INil)))))
+                    (= ?sig_out (Op (Recip ?shape ?add_stride ?out_stride) (ICons ?plus_one (INil))))
+                    (= ?dt (dtype ?x))
+                )
+                (
+                    (let ?ksig (Op (KernelSigmoid ?shape ?x_stride ?out_stride ?dt) (ICons ?x (INil))))
+                    (union ?sig_out ?ksig)
+                    (set (dtype ?ksig) ?dt)
+                )
+                :name \"direct-sigmoid-fusion\"
+            )",
+            ),
+        ]
+    }
+
+    fn cleanup(&self) -> bool {
+        false
+    }
+
+    fn extract<'a>(
+        &'a self,
+        egraph: &'a SerializedEGraph,
+        kind_children: &[&'a ENodeId],
+        input_enodes: Vec<&'a ENodeId>,
+        list_cache: &mut FxHashMap<&'a ENodeId, Vec<Expression>>,
+        expr_cache: &mut FxHashMap<&'a ENodeId, Expression>,
+    ) -> (LLIROp, Vec<&'a ENodeId>) {
+        (
+            LLIROp::new::<dyn KernelOp>(Box::new(Self {
+                shape: extract_expr_list(egraph, kind_children[0], list_cache, expr_cache).unwrap(),
+                in_strides: extract_expr_list(egraph, kind_children[1], list_cache, expr_cache)
+                    .unwrap(),
+                out_strides: extract_expr_list(egraph, kind_children[2], list_cache, expr_cache)
+                    .unwrap(),
+                dtype: extract_dtype(egraph, kind_children[3]),
+            })),
+            input_enodes,
+        )
+    }
+}
+
+impl KernelOp for KernelSigmoid {
+    fn compile(
+        &self,
+        stream: &Arc<CudaStream>,
+        compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
+    ) -> (
+        CudaFunction,
+        Arc<CudaModule>,
+        String,
+        (Expression, Expression, Expression),
+        (Expression, Expression, Expression),
+        Expression,
+        FxHashMap<char, CudaSlice<u8>>,
+    ) {
+        let vars = self
+            .shape
+            .iter()
+            .flat_map(|e| e.dyn_vars())
+            .chain(self.in_strides.iter().flat_map(|e| e.dyn_vars()))
+            .chain(self.out_strides.iter().flat_map(|e| e.dyn_vars()))
+            .collect::<FxHashSet<_>>();
+        let dtype = cuda_dtype(self.dtype);
+        let includes = dtype_includes(&[self.dtype]);
+        let (dyn_defines, _sorted_dims) = generate_dyn_dims_defines(&vars);
+        let dyn_dims_param = if vars.is_empty() {
+            ""
+        } else {
+            ", const int* dyn_dims"
+        };
+        let n_elements = self
+            .shape
+            .iter()
+            .copied()
+            .product::<Expression>()
+            .to_kernel();
+        let out_idx = flatten_strides(&self.shape, &self.out_strides).to_kernel();
+        let in_idx = flatten_strides(&self.shape, &self.in_strides).to_kernel();
+        let kernel = format!(
+            "{includes}
+{dyn_defines}
+extern \"C\" {{
+    __global__ void sigmoid_k({dtype} *out, const {dtype} *in{dyn_dims_param}) {{
+        long long const_z = (long long)blockIdx.x * blockDim.x + threadIdx.x;
+        if (const_z >= {n_elements}) return;
+        out[{out_idx}] = 1.0f / (1.0f + expf(-in[{in_idx}]));
+    }}
+}}"
+        );
+        let (module, func) = if let Some((module, func)) = compile_cache.get(&kernel) {
+            (module.clone(), func.clone())
+        } else {
+            let ptx = compile_module_image_for_current_device(stream.context(), &kernel).unwrap();
+            let module = stream.context().load_module(ptx).unwrap();
+            let func = module.load_function("sigmoid_k").unwrap();
+            compile_cache.insert(kernel.clone(), (module.clone(), func.clone()));
+            (module, func)
+        };
+        let out_size = self.shape.iter().copied().product::<Expression>();
+        (
+            func,
+            module,
+            kernel,
+            (out_size.ceil_div(256), 1.into(), 1.into()),
+            (out_size.min(256), 1.into(), 1.into()),
+            0.into(),
+            FxHashMap::default(),
+        )
+    }
+
+    fn output_size(&self) -> Expression {
+        self.shape.iter().copied().product()
+    }
+
+    fn output_bytes(&self) -> Expression {
+        (self.output_size() * self.dtype.bits()).ceil_div(8)
+    }
+
+    fn bytes_loaded(&self) -> Expression {
+        self.output_bytes()
+    }
+
+    fn bytes_stored(&self) -> Expression {
+        self.output_bytes()
+    }
+
+    fn flops(&self) -> Expression {
+        // neg + exp + add + recip = ~4 ops per element
+        self.shape.iter().copied().product::<Expression>() * 4
+    }
+
+    fn output_dtype(&self) -> DType {
+        self.dtype
+    }
+
+    fn kernel_name(&self) -> &'static str {
+        "Sigmoid"
+    }
+}
--- a/crates/luminal_cuda_lite/src/kernel/rope.rs
+++ b/crates/luminal_cuda_lite/src/kernel/rope.rs
@@ -1,189 +0,0 @@
-//! Fused RoPE (rotary position embedding) — interleaved-pair convention.
-//!
-//! Replaces flux2's 6-op RoPE chain (split / slice / squeeze / neg / concat /
-//! merge_dims / 4× cast / mul / add) with a single kernel launch per call.
-//! ~120 RoPE calls per forward pass at full DiT depth.
-//!
-//! Convention: `repeat_interleave_real=True` (Flux 2 / diffusers), so adjacent
-//! dim pairs rotate together. For an input `[a0, b0, a1, b1, ...]` and per-
-//! position `(cos, sin)`, the output is
-//!   `out[2j]   = x[2j]   * cos[2j]   - x[2j+1] * sin[2j]`
-//!   `out[2j+1] = x[2j+1] * cos[2j+1] + x[2j]   * sin[2j+1]`
-//!
-//! Layout: x `(S, H, D)`, cos/sin `(S, D)` (broadcast across H).
-
-use std::sync::Arc;
-
-use cudarc::driver::{CudaFunction, CudaModule, CudaSlice, CudaStream};
-use luminal::{
-    dtype::DType, op::CustomOp, op::LLIROp, prelude::FxHashMap, prelude::GraphTensor,
-    shape::Expression,
-};
-
-use crate::compile_module_image_for_current_device;
-use crate::kernel::KernelOp;
-
-#[derive(Debug, Clone)]
-pub struct RoPEKernel {
-    pub s: usize,
-    pub h: usize,
-    pub d: usize,
-}
-
-const TPB: usize = 64;
-
-impl KernelOp for RoPEKernel {
-    fn compile(
-        &self,
-        stream: &Arc<CudaStream>,
-        compile_cache: &mut FxHashMap<String, (Arc<CudaModule>, CudaFunction)>,
-    ) -> (
-        CudaFunction,
-        Arc<CudaModule>,
-        String,
-        (Expression, Expression, Expression),
-        (Expression, Expression, Expression),
-        Expression,
-        FxHashMap<char, CudaSlice<u8>>,
-    ) {
-        let s = self.s;
-        let h = self.h;
-        let d = self.d;
-        assert!(d.is_multiple_of(2), "RoPE head_dim must be even");
-        let kernel = format!(
-            r#"
-extern "C" __global__ void rope_kernel(
-    float* __restrict__ out,
-    const float* __restrict__ x,
-    const float* __restrict__ cos_,
-    const float* __restrict__ sin_
-) {{
-    const int S = {s};
-    const int H = {h};
-    const int D = {d};
-    int sh = blockIdx.x;       // 0..S*H
-    int s_idx = sh / H;
-    int tid = threadIdx.x;
-
-    const float* xr   = x    + sh    * D;
-    const float* cosr = cos_ + s_idx * D;
-    const float* sinr = sin_ + s_idx * D;
-    float* yr = out + sh * D;
-
-    for (int i = tid; i < D; i += {TPB}) {{
-        float xi = xr[i];
-        float xpair;
-        if ((i & 1) == 0) {{
-            // even: paired with i+1, rotated value is -x[i+1]
-            xpair = -xr[i + 1];
-        }} else {{
-            // odd: paired with i-1, rotated value is +x[i-1]
-            xpair = xr[i - 1];
-        }}
-        yr[i] = xi * cosr[i] + xpair * sinr[i];
-    }}
-}}
-"#
-        );
-
-        let (module, func) = if let Some((m, f)) = compile_cache.get(&kernel) {
-            (m.clone(), f.clone())
-        } else {
-            let ptx = compile_module_image_for_current_device(stream.context(), &kernel).unwrap();
-            let module = stream.context().load_module(ptx).unwrap();
-            let func = module.load_function("rope_kernel").unwrap();
-            compile_cache.insert(kernel.clone(), (module.clone(), func.clone()));
-            (module, func)
-        };
-
-        (
-            func,
-            module,
-            "rope_kernel".to_string(),
-            (
-                Expression::from(s * h),
-                Expression::from(1usize),
-                Expression::from(1usize),
-            ),
-            (
-                Expression::from(TPB),
-                Expression::from(1usize),
-                Expression::from(1usize),
-            ),
-            Expression::from(0usize),
-            FxHashMap::default(),
-        )
-    }
-
-    fn output_size(&self) -> Expression {
-        Expression::from(self.s * self.h * self.d)
-    }
-
-    fn output_bytes(&self) -> Expression {
-        self.output_size() * 4
-    }
-
-    fn output_dtype(&self) -> DType {
-        DType::F32
-    }
-
-    fn bytes_loaded(&self) -> Expression {
-        // x: full (S,H,D); cos/sin: (S,D) read H times each but cached.
-        Expression::from(self.s * self.h * self.d * 4 + self.s * self.d * 4 * 2)
-    }
-
-    fn bytes_stored(&self) -> Expression {
-        self.output_size() * 4
-    }
-
-    fn flops(&self) -> Expression {
-        // 4 per output element (mul, neg/load, mul, add).
-        Expression::from(self.s * self.h * self.d * 4)
-    }
-
-    fn kernel_name(&self) -> &'static str {
-        "RoPE"
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct RoPECustom(pub RoPEKernel);
-
-impl CustomOp for RoPECustom {
-    fn to_llir_op(&self) -> LLIROp {
-        LLIROp::new::<dyn KernelOp>(Box::new(self.0.clone()) as Box<dyn KernelOp>)
-    }
-}
-
-/// Apply RoPE: `x` shape `(S, H, D)` F32, `cos`/`sin` shape `(S, D)` F32.
-/// Returns `(S, H, D)` F32.
-pub fn apply_rope(x: GraphTensor, cos: GraphTensor, sin: GraphTensor) -> GraphTensor {
-    assert_eq!(x.dtype, DType::F32, "RoPE x must be F32");
-    let cos = if cos.dtype == DType::F32 {
-        cos
-    } else {
-        cos.cast(DType::F32)
-    };
-    let sin = if sin.dtype == DType::F32 {
-        sin
-    } else {
-        sin.cast(DType::F32)
-    };
-    let x_dims = x.dims();
-    assert_eq!(x_dims.len(), 3, "RoPE x must be 3-D (S, H, D)");
-    let s = x_dims[0].to_usize().expect("RoPE: S must be static");
-    let h = x_dims[1].to_usize().expect("RoPE: H must be static");
-    let d = x_dims[2].to_usize().expect("RoPE: D must be static");
-    let cos_dims = cos.dims();
-    let sin_dims = sin.dims();
-    assert_eq!(cos_dims.len(), 2, "RoPE cos must be 2-D (S, D)");
-    assert_eq!(sin_dims.len(), 2, "RoPE sin must be 2-D (S, D)");
-    assert_eq!(cos_dims[0].to_usize().unwrap(), s, "RoPE cos S mismatch");
-    assert_eq!(cos_dims[1].to_usize().unwrap(), d, "RoPE cos D mismatch");
-    assert_eq!(sin_dims[0].to_usize().unwrap(), s, "RoPE sin S mismatch");
-    assert_eq!(sin_dims[1].to_usize().unwrap(), d, "RoPE sin D mismatch");
-
-    let kern = RoPEKernel { s, h, d };
-    let cx = unsafe { &mut *x.graph_ref };
-    cx.custom_op(RoPECustom(kern), vec![x, cos, sin], (s, h, d), DType::F32)
-}
--- a/crates/luminal_cuda_lite/src/kernel/to_host.rs
+++ b/crates/luminal_cuda_lite/src/kernel/to_host.rs
@@ -7,13 +7,13 @@ use std::cell::RefCell;
 use std::sync::Arc;

 use cudarc::driver::{
-    CudaFunction, CudaModule, CudaSlice, CudaStream, DevicePtr, sys::CUgraphNode,
+    CudaFunction, CudaModule, CudaSlice, CudaStream, DevicePtr,
+    sys::{CUgraphNode, CUresult, cuLaunchKernel},
 };
 use itertools::Itertools;
 use luminal::{
    egglog_utils::{api::Rule, base::OP_KIND},
    graph::LLIRGraph,
-    hlir::{LoopEnd, LoopInput, LoopInputStatic, LoopOutput, LoopOutputSelect, LoopStart},
    op::{EgglogOp, LLIROp},
    prelude::{
        petgraph::{Direction, algo::toposort, visit::EdgeRef},
@@ -23,7 +23,7 @@ use luminal::{
 use tracing::{Level, enabled, span};

 use crate::{
-    host::{DeviceBuffer, HostOp},
+    host::HostOp,
    kernel::{
        CudaFunctionExt, CudaGraphExecHandle, CudaGraphHandle, KernelOp, create_cuda_event,
        destroy_cuda_event,
@@ -48,12 +48,8 @@ struct CompiledKernel {
    shared_mem: Expression,
    /// Input node indices (for buffer lookup)
    inputs: Vec<NodeIndex>,
-    /// Human-readable labels for input nodes, for launch diagnostics.
-    input_labels: Vec<String>,
    /// Reference to the KernelOp for trait methods
    kernel_op: Arc<Box<dyn KernelOp>>,
-    /// Whether this compiled CUDA function has a trailing dyn_dims parameter.
-    has_dyn_dims_param: bool,
    /// Internal buffers allocated for this kernel
    internal_bufs: Vec<CudaSlice<u8>>,
    /// Device constants from compile()
@@ -73,9 +69,7 @@ impl CompiledKernel {
        block: (Expression, Expression, Expression),
        shared_mem: Expression,
        inputs: Vec<NodeIndex>,
-        input_labels: Vec<String>,
        kernel_op: Arc<Box<dyn KernelOp>>,
-        has_dyn_dims_param: bool,
        constants: FxHashMap<char, CudaSlice<u8>>,
        kernel_name: &'static str,
    ) -> Self {
@@ -86,9 +80,7 @@ impl CompiledKernel {
            block,
            shared_mem,
            inputs,
-            input_labels,
            kernel_op,
-            has_dyn_dims_param,
            internal_bufs: Vec::new(),
            constants,
            graph_node: None,
@@ -192,32 +184,6 @@ impl CudaGraphOp {
            state: RefCell::new(state),
        }
    }
-
-    /// LLIR node IDs of every kernel in this CudaGraphOp, in the order
-    /// they execute inside the compiled CUDA graph. This is the
-    /// toposort `kernel_to_host` used at compile time, preserved here
-    /// so the runtime can compute live ranges that match real
-    /// execution order: each kernel in `state.kernels` was added to
-    /// the CUDA graph with `prev_graph_node` as its sole dependency,
-    /// which serializes them.
-    pub fn kernel_topo_order(&self) -> Vec<NodeIndex> {
-        self.state.borrow().kernels.iter().map(|k| k.node).collect()
-    }
-
-    /// Direct LLIR-node inputs of one kernel inside this CudaGraphOp.
-    /// Used by the runtime's live-range pass to refine intra-graph
-    /// consumer positions: a kernel's input can stop being live as
-    /// soon as that specific kernel finishes, not when the whole
-    /// CudaGraphOp finishes.
-    pub fn kernel_inputs(&self, kernel_node: NodeIndex) -> Vec<NodeIndex> {
-        self.state
-            .borrow()
-            .kernels
-            .iter()
-            .find(|k| k.node == kernel_node)
-            .map(|k| k.inputs.clone())
-            .unwrap_or_default()
-    }
 }

 impl std::fmt::Debug for CudaGraphOp {
@@ -261,7 +227,7 @@ impl HostOp for CudaGraphOp {
        stream: &Arc<CudaStream>,
        _self_node: NodeIndex,
        _inputs: &[NodeIndex],
-        buffers: &FxHashMap<NodeIndex, DeviceBuffer>,
+        buffers: &FxHashMap<NodeIndex, &CudaSlice<u8>>,
        dyn_map: &FxHashMap<char, usize>,
    ) -> anyhow::Result<()> {
        self.execute_internal(stream, buffers, dyn_map)
@@ -293,40 +259,6 @@ impl HostOp for CudaGraphOp {
            .collect()
    }

-    fn extra_buffer_lifetimes(&self) -> Option<Vec<(NodeIndex, usize, usize)>> {
-        let state = self.state.borrow();
-        let mut lifetimes: FxHashMap<NodeIndex, (usize, usize)> = FxHashMap::default();
-        let max_step = state.kernels.len().saturating_sub(1);
-
-        let mut touch = |node: NodeIndex, step: usize| {
-            lifetimes
-                .entry(node)
-                .and_modify(|(first, last)| {
-                    *first = (*first).min(step);
-                    *last = (*last).max(step);
-                })
-                .or_insert((step, step));
-        };
-
-        for (step, kernel) in state.kernels.iter().enumerate() {
-            for &input in &kernel.inputs {
-                touch(input, step);
-            }
-            touch(kernel.node, step);
-        }
-
-        for node in self.extra_buffer_nodes() {
-            lifetimes.entry(node).or_insert((0, max_step));
-        }
-
-        Some(
-            lifetimes
-                .into_iter()
-                .map(|(node, (start, end))| (node, start, end))
-                .collect(),
-        )
-    }
-
    fn extra_buffer_sizes(&self) -> FxHashMap<NodeIndex, Expression> {
        self.buffer_sizes.clone()
    }
@@ -337,66 +269,21 @@ impl HostOp for CudaGraphOp {
 }

 impl CudaGraphOp {
-    fn expected_kernel_inputs(kernel_name: &str) -> Option<usize> {
-        match kernel_name {
-            "Constant" | "Iota" => Some(0),
-            "MaxReduce" | "MeanReduce" | "SumReduce" | "Cast" | "Exp" | "Exp2" | "Log2" | "Sin"
-            | "Recip" | "Sigmoid" | "Softmax" | "Sqrt" => Some(1),
-            "Add" | "BatchMatMul" | "BatchMatVec" | "Embed" | "Gather" | "LessThan" | "Mod"
-            | "Mul" => Some(2),
-            "Scatter" | "ScatterNoCopy" => Some(3),
-            _ => None,
-        }
-    }
-
-    fn kernel_requires_output_buffer(
-        kernel: &CompiledKernel,
-        dyn_map: &FxHashMap<char, usize>,
-    ) -> bool {
-        kernel.kernel_op.output_size().exec(dyn_map).unwrap_or(1) != 0
-            && kernel.kernel_op.output_aliases_input().is_none()
-    }
-
-    fn validate_kernel_pointers(
-        kernel: &CompiledKernel,
-        output_ptr: u64,
-        input_ptrs: &[u64],
-        dyn_map: &FxHashMap<char, usize>,
-    ) -> anyhow::Result<()> {
-        if Self::kernel_requires_output_buffer(kernel, dyn_map) && output_ptr == 0 {
-            anyhow::bail!(
-                "missing output buffer for CUDA kernel {} at LLIR node {:?}",
-                kernel.kernel_name,
-                kernel.node,
-            );
-        }
-
-        for (idx, (input_node, input_ptr)) in kernel.inputs.iter().zip(input_ptrs).enumerate() {
-            if *input_ptr == 0 {
-                let input_label = kernel
-                    .input_labels
-                    .get(idx)
-                    .map(String::as_str)
-                    .unwrap_or("unknown");
-                anyhow::bail!(
-                    "missing input buffer {idx} for CUDA kernel {} at LLIR node {:?}; input LLIR node {:?} ({input_label})",
-                    kernel.kernel_name,
-                    kernel.node,
-                    input_node,
-                );
-            }
-        }
-
-        Ok(())
-    }
-
    /// Execute the CUDA graph with the given buffers and dynamic dimensions.
    fn execute_internal(
        &self,
        stream: &Arc<CudaStream>,
-        buffers: &FxHashMap<NodeIndex, DeviceBuffer>,
+        buffers: &FxHashMap<NodeIndex, &CudaSlice<u8>>,
        dyn_map: &FxHashMap<char, usize>,
    ) -> anyhow::Result<()> {
+        // Debug path: launch each kernel sequentially with sync between, so the
+        // failing kernel surfaces instead of the generic "CudaGraph" panic.
+        // Enable via `LUMINAL_DEBUG_SEQ=1`. Slow — only for diagnosing
+        // CUDA_ERROR_ILLEGAL_ADDRESS / NaN / wrong-output bugs in graph batching.
+        if std::env::var("LUMINAL_DEBUG_SEQ").is_ok() {
+            return self.execute_sequential_for_debug(stream, buffers, dyn_map);
+        }
+
        let mut state = self.state.borrow_mut();
        let _span = span!(Level::TRACE, "cuda_graph", kernels = state.kernels.len()).entered();

@@ -465,7 +352,7 @@ impl CudaGraphOp {
        let mut current_buffer_ptrs: FxHashMap<NodeIndex, u64> = FxHashMap::default();
        for &node in &self.buffer_nodes {
            if let Some(buf) = buffers.get(&node) {
-                current_buffer_ptrs.insert(node, buf.ptr());
+                current_buffer_ptrs.insert(node, buf.device_ptr(stream).0);
            }
        }

@@ -513,26 +400,13 @@ impl CudaGraphOp {
                    .iter()
                    .map(|inp| current_buffer_ptrs.get(inp).copied().unwrap_or(0))
                    .collect();
-                Self::validate_kernel_pointers(kernel, output_ptr, &input_ptrs, dyn_map)?;
-                let kernel_dyn_dims_ptr = if kernel.has_dyn_dims_param {
-                    dyn_dims_ptr
-                } else {
-                    0
-                };
-                if kernel.has_dyn_dims_param && kernel_dyn_dims_ptr == 0 {
-                    anyhow::bail!(
-                        "missing dyn_dims buffer for CUDA kernel {} at LLIR node {:?}",
-                        kernel.kernel_name,
-                        kernel.node,
-                    );
-                }

                let param_values = kernel.kernel_op.build_params(
                    stream,
                    output_ptr,
                    &input_ptrs,
                    &kernel.internal_bufs,
-                    kernel_dyn_dims_ptr,
+                    dyn_dims_ptr,
                );
                state.kernel_params[idx] = UnifiedKernelParams::new(param_values);
            }
@@ -559,19 +433,6 @@ impl CudaGraphOp {
                    kernel.block.1.exec(dyn_map).unwrap() as u32,
                    kernel.block.2.exec(dyn_map).unwrap() as u32,
                );
-                if grid_dim.0 == 0
-                    || grid_dim.1 == 0
-                    || grid_dim.2 == 0
-                    || block_dim.0 == 0
-                    || block_dim.1 == 0
-                    || block_dim.2 == 0
-                {
-                    anyhow::bail!(
-                        "invalid CUDA launch dimensions for kernel {} at LLIR node {:?}: grid={grid_dim:?} block={block_dim:?}",
-                        kernel.kernel_name,
-                        kernel.node,
-                    );
-                }
                let shared_mem = kernel.shared_mem.exec(dyn_map).unwrap() as u32;
                let cu_func = unsafe { kernel.function.raw_function() };

@@ -595,12 +456,158 @@ impl CudaGraphOp {
        Ok(())
    }

+    /// Diagnostic path for kernel-level errors that surface as a generic
+    /// `CUDA_ERROR_ILLEGAL_ADDRESS` panic from the batched cuda_graph_exec
+    /// launch. Bypasses CUDA-graph batching entirely: builds params per
+    /// kernel and launches each via `cuLaunchKernel`, syncing afterwards so
+    /// the offending kernel reports itself instead of being hidden inside
+    /// the graph's atomic launch.
+    ///
+    /// Enabled via `LUMINAL_DEBUG_SEQ=1`. ~10–100× slower than the graph
+    /// path; not for production.
+    fn execute_sequential_for_debug(
+        &self,
+        stream: &Arc<CudaStream>,
+        buffers: &FxHashMap<NodeIndex, &CudaSlice<u8>>,
+        dyn_map: &FxHashMap<char, usize>,
+    ) -> anyhow::Result<()> {
+        let mut state = self.state.borrow_mut();
+        let num_kernels = state.kernels.len();
+
+        // Allocate dyn_dims_buffer if needed and copy current values.
+        if !self.dyn_dims_order.is_empty() && state.dyn_dims_buffer.is_none() {
+            state.dyn_dims_buffer = Some(stream.alloc_zeros::<i32>(self.dyn_dims_order.len())?);
+        }
+        if !self.dyn_dims_order.is_empty() {
+            let values: Vec<i32> = self
+                .dyn_dims_order
+                .iter()
+                .map(|d| dyn_map.get(d).copied().unwrap_or(0) as i32)
+                .collect();
+            if let Some(buf) = state.dyn_dims_buffer.as_mut() {
+                stream.memcpy_htod(&values, buf)?;
+            }
+        }
+        let dyn_dims_ptr = state
+            .dyn_dims_buffer
+            .as_ref()
+            .map(|buf| buf.device_ptr(stream).0)
+            .unwrap_or(0);
+
+        // Collect buffer pointers (mirrors the graph path).
+        let mut buffer_ptrs: FxHashMap<NodeIndex, u64> = FxHashMap::default();
+        for &node in &self.buffer_nodes {
+            if let Some(buf) = buffers.get(&node) {
+                buffer_ptrs.insert(node, buf.device_ptr(stream).0);
+            }
+        }
+        for kernel in state.kernels.iter() {
+            if let Some(input_idx) = kernel.kernel_op.output_aliases_input()
+                && let Some(&input_ptr) = buffer_ptrs.get(&kernel.inputs[input_idx])
+            {
+                buffer_ptrs.insert(kernel.node, input_ptr);
+            }
+        }
+
+        // Allocate internal buffers + run pre_execute for every kernel up front.
+        for idx in 0..num_kernels {
+            let kernel = &mut state.kernels[idx];
+            if kernel.internal_bufs.is_empty() {
+                kernel.internal_bufs = kernel.kernel_op.allocate_internal_buffers(stream, dyn_map);
+            }
+            kernel.kernel_op.pre_execute(
+                stream,
+                &mut kernel.internal_bufs,
+                &mut kernel.constants,
+                &buffer_ptrs,
+                dyn_map,
+            );
+        }
+
+        let cu_stream = stream.cu_stream();
+
+        for idx in 0..num_kernels {
+            let kernel = &state.kernels[idx];
+            let kernel_name = kernel.kernel_op.kernel_name();
+            let node = kernel.node;
+
+            let grid = (
+                kernel.grid.0.exec(dyn_map).unwrap() as u32,
+                kernel.grid.1.exec(dyn_map).unwrap() as u32,
+                kernel.grid.2.exec(dyn_map).unwrap() as u32,
+            );
+            let block = (
+                kernel.block.0.exec(dyn_map).unwrap() as u32,
+                kernel.block.1.exec(dyn_map).unwrap() as u32,
+                kernel.block.2.exec(dyn_map).unwrap() as u32,
+            );
+            let shared_mem = kernel.shared_mem.exec(dyn_map).unwrap() as u32;
+
+            let output_ptr = buffer_ptrs.get(&node).copied().unwrap_or(0);
+            let input_ptrs: Vec<u64> = kernel
+                .inputs
+                .iter()
+                .map(|inp| buffer_ptrs.get(inp).copied().unwrap_or(0))
+                .collect();
+
+            let param_values = kernel.kernel_op.build_params(
+                stream,
+                output_ptr,
+                &input_ptrs,
+                &kernel.internal_bufs,
+                dyn_dims_ptr,
+            );
+            let mut params = UnifiedKernelParams::new(param_values);
+            let cu_func = unsafe { kernel.function.raw_function() };
+
+            let result = unsafe {
+                cuLaunchKernel(
+                    cu_func,
+                    grid.0,
+                    grid.1,
+                    grid.2,
+                    block.0,
+                    block.1,
+                    block.2,
+                    shared_mem,
+                    cu_stream,
+                    params.as_cuda_params(),
+                    std::ptr::null_mut(),
+                )
+            };
+            if result != CUresult::CUDA_SUCCESS {
+                eprintln!(
+                    "[seq-debug] kernel #{idx}/{num_kernels} '{kernel_name}' \
+                     node={node:?} grid={grid:?} block={block:?} \
+                     output_ptr={output_ptr:#x} inputs={input_ptrs:#x?} \
+                     LAUNCH FAILED: {result:?}"
+                );
+                anyhow::bail!(
+                    "kernel #{idx} '{kernel_name}' (node {node:?}) launch failed: {result:?}"
+                );
+            }
+            if let Err(e) = stream.synchronize() {
+                eprintln!(
+                    "[seq-debug] kernel #{idx}/{num_kernels} '{kernel_name}' \
+                     node={node:?} grid={grid:?} block={block:?} \
+                     output_ptr={output_ptr:#x} inputs={input_ptrs:#x?} \
+                     SYNC FAILED: {e}"
+                );
+                anyhow::bail!(
+                    "kernel #{idx} '{kernel_name}' (node {node:?}) sync failed: {e}"
+                );
+            }
+        }
+
+        Ok(())
+    }
+
    /// Build the CUDA graph from compiled kernels.
    fn build_graph(
        &self,
        state: &mut std::cell::RefMut<'_, CudaGraphOpState>,
        stream: &Arc<CudaStream>,
-        buffers: &FxHashMap<NodeIndex, DeviceBuffer>,
+        buffers: &FxHashMap<NodeIndex, &CudaSlice<u8>>,
        dyn_map: &FxHashMap<char, usize>,
    ) -> anyhow::Result<()> {
        let ctx = stream.context().clone();
@@ -622,7 +629,7 @@ impl CudaGraphOp {
        let mut buffer_ptrs: FxHashMap<NodeIndex, u64> = FxHashMap::default();
        for &node in &self.buffer_nodes {
            if let Some(buf) = buffers.get(&node) {
-                buffer_ptrs.insert(node, buf.ptr());
+                buffer_ptrs.insert(node, buf.device_ptr(stream).0);
            }
        }

@@ -669,19 +676,6 @@ impl CudaGraphOp {
                kernel.block.1.exec(dyn_map).unwrap() as u32,
                kernel.block.2.exec(dyn_map).unwrap() as u32,
            );
-            if grid_dim.0 == 0
-                || grid_dim.1 == 0
-                || grid_dim.2 == 0
-                || block_dim.0 == 0
-                || block_dim.1 == 0
-                || block_dim.2 == 0
-            {
-                anyhow::bail!(
-                    "invalid CUDA launch dimensions for kernel {} at LLIR node {:?}: grid={grid_dim:?} block={block_dim:?}",
-                    kernel.kernel_name,
-                    kernel.node,
-                );
-            }
            let shared_mem = kernel.shared_mem.exec(dyn_map).unwrap() as u32;

            let output_ptr = buffer_ptrs.get(&kernel.node).copied().unwrap_or(0);
@@ -690,41 +684,18 @@ impl CudaGraphOp {
                .iter()
                .map(|inp| buffer_ptrs.get(inp).copied().unwrap_or(0))
                .collect();
-            Self::validate_kernel_pointers(kernel, output_ptr, &input_ptrs, dyn_map)?;
-            let kernel_dyn_dims_ptr = if kernel.has_dyn_dims_param {
-                dyn_dims_ptr
-            } else {
-                0
-            };
-            if kernel.has_dyn_dims_param && kernel_dyn_dims_ptr == 0 {
-                anyhow::bail!(
-                    "missing dyn_dims buffer for CUDA kernel {} at LLIR node {:?}",
-                    kernel.kernel_name,
-                    kernel.node,
-                );
-            }

            let param_values = kernel.kernel_op.build_params(
                stream,
                output_ptr,
                &input_ptrs,
                &kernel.internal_bufs,
-                kernel_dyn_dims_ptr,
+                dyn_dims_ptr,
            );
            let mut params = UnifiedKernelParams::new(param_values);

            let cu_func = unsafe { kernel.function.raw_function() };
            let kernel_node = kernel.node;
-            if std::env::var_os("LUMINAL_CUDA_DEBUG_GRAPH").is_some() {
-                eprintln!(
-                    "cuGraphAddKernelNode kernel={} node={:?} grid={grid_dim:?} block={block_dim:?} shared_mem={shared_mem} inputs={} has_dyn={} params={}",
-                    kernel.kernel_name,
-                    kernel.node,
-                    kernel.inputs.len(),
-                    kernel.has_dyn_dims_param,
-                    params.values.len(),
-                );
-            }

            // Get timing event for this index (separate access from kernels)
            let timing_event = if tracing_enabled {
@@ -840,42 +811,12 @@ pub fn kernel_to_host(
    }

    let kernel_subgraphs = partition_marked_convex(llir_graph, &kernel_ops_in_graph).unwrap();
-    // Compute the set of FS / FE / Cuda*Elementwise nodes globally absorbed by some
-    // FusionEnd in the LLIR. Used by `build_compile_units` to suppress
-    // standalone marker compile units for shared FS leaves whose consumers
-    // live in a different convex subgraph than the FS itself.
+    // Compute the set of FS / FE / FusedX nodes globally absorbed by some
+    // FusionEnd in the LLIR. Used by `build_compile_units` to suppress the
+    // identity-memcpy fallback for shared FS leaves whose consumers live
+    // in a different convex subgraph than the FS itself.
    let globally_absorbed = region_codegen::globally_absorbed_markers(llir_graph);

-    let name_of = |graph: &LLIRGraph, idx: NodeIndex| -> Option<&'static str> {
-        graph
-            .node_weight(idx)
-            .and_then(|op| op.to_dialect::<dyn KernelOp>().map(|k| k.kernel_name()))
-    };
-    let is_transparent_input = |graph: &LLIRGraph, node: NodeIndex| -> bool {
-        name_of(graph, node) == Some("FusionStart")
-            || graph[node].to_op::<LoopStart>().is_some()
-            || graph[node].to_op::<LoopEnd>().is_some()
-            || graph[node].to_op::<LoopInput>().is_some()
-            || graph[node].to_op::<LoopInputStatic>().is_some()
-            || graph[node].to_op::<LoopOutput>().is_some()
-            || graph[node].to_op::<LoopOutputSelect>().is_some()
-    };
-    let resolve_transparent_input = |graph: &LLIRGraph, mut node: NodeIndex| -> NodeIndex {
-        let mut visited = FxHashSet::default();
-        while visited.insert(node) && is_transparent_input(graph, node) {
-            let Some(pred) = graph
-                .edges_directed(node, Direction::Incoming)
-                .sorted_by_key(|e| e.id())
-                .map(|e| e.source())
-                .next()
-            else {
-                break;
-            };
-            node = pred;
-        }
-        node
-    };
-
    // Track which kernel node belongs to which CudaGraphOp (for later edge creation)
    let mut kernel_to_cuda_graph: FxHashMap<NodeIndex, NodeIndex> = FxHashMap::default();
    // Track all CudaGraphOp nodes and their subgraphs for edge creation
@@ -892,7 +833,6 @@ pub fn kernel_to_host(
        let mut all_dyn_dims = FxHashSet::default();
        let mut all_buffer_nodes = FxHashSet::default();
        let mut all_buffer_sizes: FxHashMap<NodeIndex, Expression> = FxHashMap::default();
-        let mut external_inputs = FxHashSet::default();

        // Pre-scan: collect all dynamic vars from all kernel ops without compiling.
        // This uses KernelOp::all_dyn_vars() which inspects struct expression fields.
@@ -906,7 +846,9 @@ pub fn kernel_to_host(
        // Set global dyn dims ordering so compiles use consistent indices
        let mut global_dyn_dims: Vec<char> = all_dyn_dims.iter().copied().collect();
        global_dyn_dims.sort();
-        set_global_dyn_dims(global_dyn_dims.clone());
+        if !global_dyn_dims.is_empty() {
+            set_global_dyn_dims(global_dyn_dims.clone());
+        }

        // Group the topo order into compile units: each FusionEnd-rooted
        // region collapses to a single CompileUnit::Region (one fused
@@ -924,35 +866,14 @@ pub fn kernel_to_host(
                        .to_dialect::<dyn KernelOp>()
                        .unwrap();

-                    let (kernel_function, _, kernel_str, grid, block, shared_mem, constants) =
+                    let (kernel_function, _, _kernel_str, grid, block, shared_mem, constants) =
                        kernel_op_ref.compile(cuda_stream, kernel_cache);
-                    let has_dyn_dims_param = kernel_str.contains("dyn_dims");

                    // Collect inputs from graph edges
                    let inputs: Vec<NodeIndex> = llir_graph
                        .edges_directed(*kernel_node_idx, Direction::Incoming)
                        .sorted_by_key(|e| e.id())
                        .map(|e| e.source())
-                        .map(|input| resolve_transparent_input(llir_graph, input))
-                        .collect_vec();
-                    if let Some(expected_inputs) =
-                        CudaGraphOp::expected_kernel_inputs(kernel_op_ref.kernel_name())
-                    {
-                        assert_eq!(
-                            inputs.len(),
-                            expected_inputs,
-                            "invalid input arity for CUDA kernel {} at LLIR node {:?}",
-                            kernel_op_ref.kernel_name(),
-                            kernel_node_idx,
-                        );
-                    }
-                    let input_labels = inputs
-                        .iter()
-                        .map(|&input| {
-                            name_of(llir_graph, input)
-                                .map(str::to_string)
-                                .unwrap_or_else(|| format!("{:?}", llir_graph[input]))
-                        })
                        .collect_vec();

                    // Collect buffer nodes and sizes
@@ -963,12 +884,6 @@ pub fn kernel_to_host(
                        all_buffer_sizes.insert(*kernel_node_idx, output_size);
                    }
                    all_buffer_nodes.extend(inputs.iter().copied());
-                    external_inputs.extend(
-                        inputs
-                            .iter()
-                            .copied()
-                            .filter(|input| !subgraph.contains(input)),
-                    );

                    let kernel_op: Arc<Box<dyn KernelOp>> = Arc::clone(kernel_op_ref);

@@ -979,9 +894,7 @@ pub fn kernel_to_host(
                        block,
                        shared_mem,
                        inputs,
-                        input_labels,
                        kernel_op.clone(),
-                        has_dyn_dims_param,
                        constants,
                        kernel_op.kernel_name(),
                    ));
@@ -994,32 +907,18 @@ pub fn kernel_to_host(
                        cuda_stream,
                        kernel_cache,
                    );
-                    let has_dyn_dims_param = compiled.kernel_str.contains("dyn_dims");

                    // The region's CompiledKernel is keyed on the FE node
                    // (so FE provides trait methods like output_size /
                    // build_params) but its `inputs` are the external
                    // producers, not FE's literal LLIR predecessors —
-                    // those are interior elementwise nodes that don't exist
+                    // those are interior FusedX nodes that don't exist
                    // as buffer-bearing nodes from the host's view.
                    let fe_op_ref = llir_graph[region.fe_node]
                        .to_dialect::<dyn KernelOp>()
                        .unwrap();

-                    let inputs: Vec<NodeIndex> = region
-                        .external_inputs
-                        .iter()
-                        .copied()
-                        .map(|input| resolve_transparent_input(llir_graph, input))
-                        .collect();
-                    let input_labels = inputs
-                        .iter()
-                        .map(|&input| {
-                            name_of(llir_graph, input)
-                                .map(str::to_string)
-                                .unwrap_or_else(|| format!("{:?}", llir_graph[input]))
-                        })
-                        .collect_vec();
+                    let inputs: Vec<NodeIndex> = region.external_inputs.clone();

                    let output_size = fe_op_ref.output_size();
                    if output_size.exec(&FxHashMap::default()).unwrap_or(1) != 0 {
@@ -1027,12 +926,6 @@ pub fn kernel_to_host(
                        all_buffer_sizes.insert(region.fe_node, output_size);
                    }
                    all_buffer_nodes.extend(inputs.iter().copied());
-                    external_inputs.extend(
-                        inputs
-                            .iter()
-                            .copied()
-                            .filter(|input| !subgraph.contains(input)),
-                    );

                    let kernel_op: Arc<Box<dyn KernelOp>> = Arc::clone(fe_op_ref);

@@ -1043,9 +936,7 @@ pub fn kernel_to_host(
                        compiled.block,
                        compiled.shared_mem,
                        inputs,
-                        input_labels,
                        kernel_op,
-                        has_dyn_dims_param,
                        compiled.constants,
                        "FusedRegion",
                    ));
@@ -1090,17 +981,16 @@ pub fn kernel_to_host(
        }
        cuda_graph_subgraphs.push((cuda_graph_node, subgraph.clone()));

-        // Find external inputs: nodes outside subgraph that have edges into
-        // subgraph. Also include normalized FusionStart predecessors, because
-        // the compiled kernels read from the concrete producer buffer rather
-        // than the marker node.
-        external_inputs.extend(subgraph.iter().flat_map(|&node| {
-            llir_graph
-                .edges_directed(node, Direction::Incoming)
-                .map(|e| e.source())
-                .map(|input| resolve_transparent_input(llir_graph, input))
-                .filter(|src| !subgraph.contains(src))
-        }));
+        // Find external inputs: nodes outside subgraph that have edges into subgraph
+        let external_inputs: FxHashSet<NodeIndex> = subgraph
+            .iter()
+            .flat_map(|&node| {
+                llir_graph
+                    .edges_directed(node, Direction::Incoming)
+                    .map(|e| e.source())
+                    .filter(|src| !subgraph.contains(src))
+            })
+            .collect();

        // Add edges from external inputs to CudaGraphOp
        for input in &external_inputs {
@@ -1165,7 +1055,7 @@ pub fn kernel_to_host(
    }

    // Strip fully-absorbed marker nodes (FusionStart, nested FusionEnd,
-    // Cuda*Elementwise) from the LLIR. Region codegen has already folded them into
+    // FusedX) from the LLIR. Region codegen has already folded them into
    // a single fused CUDA function anchored at each region's root
    // FusionEnd; the absorbed nodes have no consumers outside the region
    // and never need their own buffers. Removing them keeps later
--- a/crates/luminal_cuda_lite/src/lib.rs
+++ b/crates/luminal_cuda_lite/src/lib.rs
@@ -1,7 +1,6 @@
 pub mod dyn_backend;
 pub mod host;
 pub mod kernel;
-mod memory_analysis;
 pub mod runtime;
 use std::{
    ffi::{CStr, CString},
--- a/crates/luminal_cuda_lite/src/memory_analysis.rs
+++ b/crates/luminal_cuda_lite/src/memory_analysis.rs
--- a/crates/luminal_cuda_lite/src/runtime.rs
+++ b/crates/luminal_cuda_lite/src/runtime.rs
--- a/crates/luminal_cuda_lite/src/tests/consumed_buffer_tests.rs
+++ b/crates/luminal_cuda_lite/src/tests/consumed_buffer_tests.rs
--- a/crates/luminal_cuda_lite/src/tests/cublaslt_rewrite_tests.rs
+++ b/crates/luminal_cuda_lite/src/tests/cublaslt_rewrite_tests.rs
--- a/crates/luminal_cuda_lite/src/tests/flashinfer.rs
+++ b/crates/luminal_cuda_lite/src/tests/flashinfer.rs
@@ -1,842 +0,0 @@
-//! Unit + integration tests for the FlashInfer port.
-//!
-//! Four layers:
-//! 1. Pure egglog metadata (no GPU): trait wiring, sort + rewrite parse cleanly.
-//! 2. Egglog rule firing (no GPU): the rule unifies on a real paged-attention
-//!    HLIR and does NOT fire on bare attention or unrelated matmul/Gather mixes.
-//! 3. Mask helper correctness (GPU): the primitive-op `test_compute_attn_mask` builder produces the right (s, c) mask.
-//! 4. Full kernel correctness (GPU + JIT): direct `FlashInferAttention::execute`
-//!    compared against a luminal-compiled reference attention graph.
-//!
-//! GPU-dependent tests short-circuit when no CUDA device is available.
-
-use std::sync::{Arc, Mutex};
-
-use cudarc::driver::{CudaStream, DevicePtr};
-use luminal::egglog_utils::{hlir_to_egglog, run_egglog};
-use luminal::op::{EgglogOp, IntoEgglogOp};
-use luminal::prelude::*;
-
-use crate::host::flashinfer::FlashInferAttention;
-use crate::host::{DeviceBuffer, HostOp};
-use crate::runtime::CudaRuntime;
-use crate::tests::utilities::get_cuda_stream;
-
-/// Look up an op in `CudaRuntime::Ops::into_vec()` by its egglog sort name.
-fn ops_contains_sort(name: &str) -> bool {
-    let ops = <CudaRuntime as luminal::op::Runtime>::Ops::into_vec();
-    ops.iter().any(|op| {
-        // `SortDef` is opaque; its Debug repr starts with the sort name.
-        let sort_dbg = format!("{:?}", op.sort());
-        sort_dbg.contains(name)
-    })
-}
-
-// ─── Test-wide model dimensions ───────────────────────────────────────────
-//
-// Small Llama-shaped GQA model: nheads=8, kv_heads=2, group=4, head_dim=64.
-// Chosen so HEAD_DIM ∈ {64, 128, 256} (FlashInfer constraint) and the test
-// suite fits in O(1ms) of GPU time per case.
-
-const HEAD_DIM: usize = 64;
-const N_KV_HEADS: usize = 2;
-const KV_GROUPS: usize = 4;
-const N_HEADS: usize = N_KV_HEADS * KV_GROUPS;
-const KV_DIM: usize = N_KV_HEADS * HEAD_DIM;
-const HIDDEN: usize = N_HEADS * HEAD_DIM;
-
-// ─── Reference attention graph (Q*K^T → softmax → *V via the compiler) ───
-
-fn build_attention_graph() -> (Graph, GraphTensor, GraphTensor, GraphTensor, GraphTensor) {
-    let mut cx = Graph::default();
-
-    let q_rope = cx.named_tensor("q_rope", ('s', HIDDEN));
-    let k_ctx = cx.named_tensor("k_ctx", ('c', KV_DIM));
-    let v_ctx_input = cx.named_tensor("v_ctx", ('c', KV_DIM));
-
-    let q = (q_rope * 1.0).split_dims(1, HEAD_DIM).transpose(0, 1);
-    let k = k_ctx.split_dims(1, HEAD_DIM).permute((1, 2, 0));
-    let v_ctx = v_ctx_input.split_dims(1, HEAD_DIM).transpose(0, 1);
-
-    // GQA broadcast: zero-stride Mul by 1.0
-    let k = k.expand_dim(1, KV_GROUPS).merge_dims(0, 1) * 1.0;
-    let v_ctx = v_ctx.expand_dim(1, KV_GROUPS).merge_dims(0, 1) * 1.0;
-
-    let scores = q.matmul(k) / (HEAD_DIM as f32).sqrt();
-    let weights = scores.softmax(2);
-    let out = weights.matmul(v_ctx);
-
-    let attn_out = out.transpose(0, 1).merge_dims(1, 2);
-    let attn_out = attn_out.output();
-
-    (cx, q_rope, k_ctx, v_ctx_input, attn_out)
-}
-
-fn run_reference_attention(
-    stream: &Arc<CudaStream>,
-    q: &[f32],
-    k: &[f32],
-    v: &[f32],
-    batch_size: usize,
-    context_len: usize,
-) -> Vec<f32> {
-    let (mut cx, q_t, k_t, v_t, out_t) = build_attention_graph();
-    cx.set_dim('s', batch_size);
-    cx.set_dim('c', context_len);
-    cx.build_search_space::<CudaRuntime>();
-
-    let mut rt = CudaRuntime::initialize(stream.clone());
-    rt.set_data(q_t, q.to_vec());
-    rt.set_data(k_t, k.to_vec());
-    rt.set_data(v_t, v.to_vec());
-    rt = cx.search(rt, 3);
-
-    rt.set_data(q_t, q.to_vec());
-    rt.set_data(k_t, k.to_vec());
-    rt.set_data(v_t, v.to_vec());
-    rt.execute(&cx.dyn_map);
-    rt.get_f32(out_t)
-}
-
-// ─── Direct FlashInfer driver ────────────────────────────────────────────
-
-fn build_flat_gather_idx(kv_indices: &[i32]) -> Vec<i32> {
-    let c = kv_indices.len();
-    let mut flat = Vec::with_capacity(c * KV_DIM);
-    for &slot in kv_indices {
-        let base = slot * KV_DIM as i32;
-        for j in 0..KV_DIM as i32 {
-            flat.push(base + j);
-        }
-    }
-    flat
-}
-
-fn transpose_hbd_to_bhd(data: &[f32], heads: usize, batch: usize, dim: usize) -> Vec<f32> {
-    let mut out = vec![0.0f32; data.len()];
-    for h in 0..heads {
-        for b in 0..batch {
-            for d in 0..dim {
-                out[b * heads * dim + h * dim + d] = data[h * batch * dim + b * dim + d];
-            }
-        }
-    }
-    out
-}
-
-fn alloc_dev(stream: &Arc<CudaStream>, bytes: usize) -> cudarc::driver::CudaSlice<u8> {
-    let bytes = bytes.max(1);
-    unsafe { stream.alloc::<u8>(bytes).unwrap() }
-}
-
-fn copy_to_dev<T: Copy>(stream: &Arc<CudaStream>, data: &[T]) -> cudarc::driver::CudaSlice<u8> {
-    let bytes = unsafe {
-        std::slice::from_raw_parts(data.as_ptr() as *const u8, std::mem::size_of_val(data))
-    };
-    stream.clone_htod(bytes).unwrap()
-}
-
-/// Run FlashInferAttention.execute() directly and reshape the output to the
-/// reference (batch, heads, dim) layout used by `run_reference_attention`.
-fn run_flashinfer(
-    stream: &Arc<CudaStream>,
-    q: &[f32],
-    k_cache: &[f32],
-    v_cache: &[f32],
-    kv_indptr: &[i32],
-    kv_indices: &[i32],
-    batch_size: usize,
-) -> Vec<f32> {
-    let q_buf = copy_to_dev(stream, q);
-    let k_buf = copy_to_dev(stream, k_cache);
-    let v_buf = copy_to_dev(stream, v_cache);
-    let flat_idx = build_flat_gather_idx(kv_indices);
-    let flat_idx_buf = copy_to_dev(stream, &flat_idx);
-    let mask_buf = alloc_dev(stream, 4); // unused but reserved
-    let qo_indptr: Vec<i32> = (0..=batch_size as i32).collect();
-    let qo_indptr_buf = copy_to_dev(stream, &qo_indptr);
-    let kv_indptr_buf = copy_to_dev(stream, kv_indptr);
-    let out_buf = alloc_dev(stream, batch_size * HIDDEN * 4);
-
-    let fi = FlashInferAttention {
-        num_qo_heads: N_HEADS,
-        num_kv_heads: N_KV_HEADS,
-        head_dim: HEAD_DIM,
-        page_size: 1,
-        batch_dim: Expression::from('s'),
-        plan_info: Mutex::new(Vec::new()),
-    };
-
-    // Reserve dedicated NodeIndex values for the test ports.
-    let nodes: Vec<NodeIndex> = (0..8).map(NodeIndex::new).collect();
-    let (q_n, k_n, v_n, idx_n, mask_n, qo_n, kv_n, out_n) = (
-        nodes[0], nodes[1], nodes[2], nodes[3], nodes[4], nodes[5], nodes[6], nodes[7],
-    );
-
-    let mut buffers = FxHashMap::default();
-    let q_ptr = q_buf.device_ptr(stream).0;
-    let k_ptr = k_buf.device_ptr(stream).0;
-    let v_ptr = v_buf.device_ptr(stream).0;
-    let idx_ptr = flat_idx_buf.device_ptr(stream).0;
-    let mask_ptr = mask_buf.device_ptr(stream).0;
-    let qo_ptr = qo_indptr_buf.device_ptr(stream).0;
-    let kv_ptr = kv_indptr_buf.device_ptr(stream).0;
-    let out_ptr = out_buf.device_ptr(stream).0;
-    buffers.insert(q_n, DeviceBuffer::new(q_ptr, q.len() * 4));
-    buffers.insert(k_n, DeviceBuffer::new(k_ptr, k_cache.len() * 4));
-    buffers.insert(v_n, DeviceBuffer::new(v_ptr, v_cache.len() * 4));
-    buffers.insert(idx_n, DeviceBuffer::new(idx_ptr, flat_idx.len() * 4));
-    buffers.insert(mask_n, DeviceBuffer::new(mask_ptr, 4));
-    buffers.insert(qo_n, DeviceBuffer::new(qo_ptr, qo_indptr.len() * 4));
-    buffers.insert(kv_n, DeviceBuffer::new(kv_ptr, kv_indptr.len() * 4));
-    buffers.insert(out_n, DeviceBuffer::new(out_ptr, batch_size * HIDDEN * 4));
-
-    let inputs = [q_n, k_n, v_n, idx_n, mask_n, qo_n, kv_n];
-
-    let mut dyn_map = FxHashMap::default();
-    dyn_map.insert('s', batch_size);
-    dyn_map.insert('c', kv_indices.len());
-    dyn_map.insert('r', kv_indptr.len());
-
-    fi.execute(stream, out_n, &inputs, &buffers, &dyn_map)
-        .expect("FlashInferAttention execute failed");
-    stream.synchronize().unwrap();
-
-    // Output is (heads, batch, dim); reshape to (batch, heads, dim).
-    let mut out_bytes = vec![0u8; batch_size * HIDDEN * 4];
-    unsafe {
-        cudarc::driver::result::memcpy_dtoh_async(&mut out_bytes, out_ptr, stream.cu_stream())
-            .unwrap();
-    }
-    stream.synchronize().unwrap();
-    let raw: Vec<f32> = unsafe {
-        let mut bytes = std::mem::ManuallyDrop::new(out_bytes);
-        let len = bytes.len() / 4;
-        Vec::from_raw_parts(bytes.as_mut_ptr() as *mut f32, len, len)
-    };
-    transpose_hbd_to_bhd(&raw, N_HEADS, batch_size, HEAD_DIM)
-}
-
-// ─── Helpers ─────────────────────────────────────────────────────────────
-
-fn deterministic_f32(n: usize, seed: f32, scale: f32) -> Vec<f32> {
-    (0..n).map(|i| (i as f32 * seed).sin() * scale).collect()
-}
-
-fn assert_close(a: &[f32], b: &[f32], rtol: f32, atol: f32) {
-    assert_eq!(
-        a.len(),
-        b.len(),
-        "length mismatch: {} vs {}",
-        a.len(),
-        b.len()
-    );
-    let mut worst = (0usize, 0.0f32);
-    for (i, (x, y)) in a.iter().zip(b.iter()).enumerate() {
-        let diff = (x - y).abs();
-        if diff > worst.1 {
-            worst = (i, diff);
-        }
-        let tol = atol + rtol * y.abs();
-        assert!(
-            diff <= tol,
-            "mismatch at idx {i}: {x} vs {y} (|diff|={diff}, tol={tol})"
-        );
-    }
-    eprintln!("max |diff| = {:.2e} @ idx {}", worst.1, worst.0);
-}
-
-// ─── Layer 1: egglog metadata sanity (no GPU) ────────────────────────────
-
-#[test]
-fn flashinfer_op_registers_via_into_egglog() {
-    // Confirm the op is reachable through the Runtime::Ops tuple. If this
-    // breaks, the egglog rule is not seen by the search and the op silently
-    // never fires.
-    assert!(
-        ops_contains_sort("FlashInferAttention"),
-        "FlashInferAttention is not in CudaRuntime::Ops"
-    );
-}
-
-#[test]
-fn flashinfer_egg_rule_parses() {
-    // Rule::raw() returns the rule with no validation; egglog parses it at
-    // graph build. Smoke-test by running it through the egglog frontend via
-    // a tiny program string.
-    let op = FlashInferAttention::default();
-    let rewrites = op.rewrites();
-    assert_eq!(rewrites.len(), 1);
-    // The rule must mention FlashInferAttention to be the right one.
-    let s = format!("{:?}", rewrites[0]);
-    assert!(
-        s.contains("FlashInferAttention"),
-        "rewrite is not the FlashInfer rule: {s}"
-    );
-}
-
-#[test]
-fn flashinfer_op_sort_shape() {
-    let op = FlashInferAttention::default();
-    let s = op.sort();
-    // 5 params, n_inputs=5 (mask, indptrs appended later in extract())
-    assert_eq!(op.n_inputs(), 5);
-    let dbg = format!("{:?}", s);
-    assert!(dbg.contains("FlashInferAttention"));
-}
-
-// ─── Layer 3: FlashInfer kernel correctness ──────────────────────────────
-
-#[test]
-fn flashinfer_bs1_ctx4() {
-    let Some(stream) = get_cuda_stream() else {
-        return;
-    };
-    let batch_size = 1;
-    let context_len = 4;
-    let q = deterministic_f32(batch_size * HIDDEN, 0.011, 0.1);
-    let k = deterministic_f32(context_len * KV_DIM, 0.021, 0.1);
-    let v = deterministic_f32(context_len * KV_DIM, 0.031, 0.1);
-    let expected = run_reference_attention(&stream, &q, &k, &v, batch_size, context_len);
-    let kv_indptr = vec![0i32, context_len as i32];
-    let kv_indices: Vec<i32> = (0..context_len as i32).collect();
-    let result = run_flashinfer(&stream, &q, &k, &v, &kv_indptr, &kv_indices, batch_size);
-    assert_close(&result, &expected, 1e-4, 1e-5);
-}
-
-#[test]
-fn flashinfer_bs2_supersequence() {
-    let Some(stream) = get_cuda_stream() else {
-        return;
-    };
-    let batch_size = 2;
-    let ctx0 = 8;
-    let ctx1 = 3;
-    let total_ctx = ctx0 + ctx1;
-
-    let q = deterministic_f32(batch_size * HIDDEN, 0.014, 0.1);
-    let k = deterministic_f32(total_ctx * KV_DIM, 0.022, 0.1);
-    let v = deterministic_f32(total_ctx * KV_DIM, 0.032, 0.1);
-
-    // Reference: run each sequence separately through the reference graph
-    // (the reference uses dense attention so we can't run bs=2 directly).
-    let expected0 = run_reference_attention(
-        &stream,
-        &q[..HIDDEN],
-        &k[..ctx0 * KV_DIM],
-        &v[..ctx0 * KV_DIM],
-        1,
-        ctx0,
-    );
-    let expected1 = run_reference_attention(
-        &stream,
-        &q[HIDDEN..],
-        &k[ctx0 * KV_DIM..],
-        &v[ctx0 * KV_DIM..],
-        1,
-        ctx1,
-    );
-    let expected: Vec<f32> = expected0.into_iter().chain(expected1).collect();
-
-    let kv_indptr = vec![0i32, ctx0 as i32, total_ctx as i32];
-    let kv_indices: Vec<i32> = (0..total_ctx as i32).collect();
-    let result = run_flashinfer(&stream, &q, &k, &v, &kv_indptr, &kv_indices, batch_size);
-    assert_close(&result, &expected, 1e-4, 1e-5);
-}
-
-#[test]
-fn flashinfer_noncontiguous_page_table() {
-    let Some(stream) = get_cuda_stream() else {
-        return;
-    };
-    let batch_size = 1;
-    let context_len = 4;
-    let num_slots = 8;
-    let slot_indices = [3usize, 0, 7, 1];
-
-    let q = deterministic_f32(batch_size * HIDDEN, 0.011, 0.1);
-    let k_full = deterministic_f32(num_slots * KV_DIM, 0.022, 0.1);
-    let v_full = deterministic_f32(num_slots * KV_DIM, 0.033, 0.1);
-
-    // Reference operates on the contiguous gathered cache.
-    let mut k_gathered = vec![0.0f32; context_len * KV_DIM];
-    let mut v_gathered = vec![0.0f32; context_len * KV_DIM];
-    for (i, &slot) in slot_indices.iter().enumerate() {
-        k_gathered[i * KV_DIM..(i + 1) * KV_DIM]
-            .copy_from_slice(&k_full[slot * KV_DIM..(slot + 1) * KV_DIM]);
-        v_gathered[i * KV_DIM..(i + 1) * KV_DIM]
-            .copy_from_slice(&v_full[slot * KV_DIM..(slot + 1) * KV_DIM]);
-    }
-    let expected = run_reference_attention(
-        &stream,
-        &q,
-        &k_gathered,
-        &v_gathered,
-        batch_size,
-        context_len,
-    );
-
-    let kv_indptr = vec![0i32, context_len as i32];
-    let kv_indices: Vec<i32> = slot_indices.iter().map(|&s| s as i32).collect();
-    let result = run_flashinfer(
-        &stream,
-        &q,
-        &k_full,
-        &v_full,
-        &kv_indptr,
-        &kv_indices,
-        batch_size,
-    );
-    assert_close(&result, &expected, 1e-4, 1e-5);
-}
-
-// ─── Layer 3b: HEAD_DIM 128 path (validates the head-dim JIT dispatch) ────
-//
-// Each FlashInfer .so is compiled for one HEAD_DIM. JIT caches by head dim;
-// the OnceLock means only one is loaded per process. We don't change head
-// dim within a single test run (would defeat the cache), but we *do* want at
-// least one test in the suite that uses 128 to keep the constant-128 build
-// path covered if the default HEAD_DIM constant changes upstream. We assert
-// the constraint here rather than firing a second JIT.
-
-#[test]
-fn flashinfer_jit_head_dim_assertion() {
-    // 64 / 128 / 256 must be the only allowed values.
-    for hd in [64usize, 128, 256] {
-        // We can't *actually* JIT a second head_dim within this process
-        // (the OnceLock binds to the first dim used). Just check the dim
-        // is in the supported set.
-        assert!(matches!(hd, 64 | 128 | 256));
-    }
-}
-
-// ─── Layer 4: egglog rule firing (no GPU) ────────────────────────────────
-//
-// These tests build HLIR graphs and run egglog saturation. They confirm:
-//   (a) the rule matches a real paged-attention pattern (full GQA, non-Llama
-//       dims, MHA);
-//   (b) the rule does NOT match bare attention (no gather/cache) or unrelated
-//       matmul+Gather mixes (which would cause e-graph blowup).
-//
-// Mask is built from primitive HLIR ops because the rule's mask anchor relies
-// on `Mul(allowed, Constant(1e10))` being visible in the e-graph.
-
-fn test_indptr_to_request_idx(
-    graph: &mut Graph,
-    indptr: GraphTensor,
-    n: Expression,
-) -> GraphTensor {
-    let r = indptr.dims1();
-    let indices = graph.arange(n).expand_dim(1, r);
-    let indptr_2d = indptr.expand_dim(0, n);
-    let ge = indptr_2d.le(indices).cast(luminal::dtype::DType::Int);
-    ge.sum(1).cast(luminal::dtype::DType::Int) - 1
-}
-
-fn test_compute_attn_mask(
-    graph: &mut Graph,
-    q_pos: GraphTensor,
-    qo_indptr: GraphTensor,
-    kv_indptr: GraphTensor,
-    c: Expression,
-) -> GraphTensor {
-    let s = q_pos.dims1();
-    let q_request = test_indptr_to_request_idx(graph, qo_indptr, s);
-    let c_request = test_indptr_to_request_idx(graph, kv_indptr, c);
-    let c_arange = graph.arange(c);
-    let c_kv_start = kv_indptr.gather(c_request);
-    let c_local_pos = c_arange - c_kv_start;
-    let q_req_2d = q_request.expand_dim(1, c);
-    let c_req_2d = c_request.expand_dim(0, s);
-    let same = q_req_2d.eq(c_req_2d);
-    let c_pos_2d = c_local_pos.expand_dim(0, s);
-    let qp_2d = q_pos.expand_dim(1, c);
-    let causal = c_pos_2d.le(qp_2d);
-    let allowed = same.cast(luminal::dtype::DType::F32) * causal.cast(luminal::dtype::DType::F32);
-    allowed * 1e10 - 1e10
-}
-
-fn gather_rows(data: GraphTensor, indices: GraphTensor, d: usize) -> GraphTensor {
-    let n = indices.dims1();
-    let base = (indices * d).expand_dim(1, d);
-    let col = data.graph().arange(d as i32).expand_dim(0, n);
-    data.gather(base + col)
-}
-
-fn scatter_rows(
-    src: GraphTensor,
-    indices: GraphTensor,
-    dest: GraphTensor,
-    d: usize,
-) -> GraphTensor {
-    let n = indices.dims1();
-    let base = (indices * d).expand_dim(1, d);
-    let col = src.graph().arange(d as i32).expand_dim(0, n);
-    src.scatter(base + col, dest)
-}
-
-/// Handles to every named input of the paged-attention test graph, returned
-/// alongside the graph so the GA-selection test can `set_data` on each one.
-#[allow(dead_code)]
-struct PagedAttnHandles {
-    q_rope: GraphTensor,
-    k_rope: GraphTensor,
-    v_new: GraphTensor,
-    k_cache: GraphTensor,
-    v_cache: GraphTensor,
-    scatter_idx: GraphTensor,
-    gather_idx: GraphTensor,
-    q_pos: GraphTensor,
-    qo_indptr: GraphTensor,
-    kv_indptr: GraphTensor,
-}
-
-/// Build a full paged-attention HLIR graph with the structural anchors the
-/// FlashInfer egglog rule looks for: scatter into a 2D cache, gather rows out
-/// by index, GQA broadcast via `Mul(..., 1.0)` with zero strides, Q*K^T → Sum
-/// → scale → mask Add → softmax → *V → Sum.
-fn build_paged_attention_graph(
-    n_heads: usize,
-    n_kv_heads: usize,
-    head_dim: usize,
-) -> (Graph, PagedAttnHandles) {
-    let kv_groups = n_heads / n_kv_heads;
-    let kv_dim = n_kv_heads * head_dim;
-    let hidden = n_heads * head_dim;
-
-    let mut cx = Graph::default();
-
-    let q_rope = cx.named_tensor("q_rope", ('s', hidden));
-    let k_rope = cx.named_tensor("k_rope", ('s', kv_dim));
-    let v_new = cx.named_tensor("v_new", ('s', kv_dim));
-    let k_cache = cx.named_tensor("k_cache", (2048, kv_dim)).persist();
-    let v_cache = cx.named_tensor("v_cache", (2048, kv_dim)).persist();
-    let scatter_idx = cx
-        .named_tensor("scatter_idx", 's')
-        .as_dtype(luminal::dtype::DType::Int);
-    let gather_idx = cx
-        .named_tensor("gather_idx", 'c')
-        .as_dtype(luminal::dtype::DType::Int);
-    let q_pos = cx
-        .named_tensor("q_pos", 's')
-        .as_dtype(luminal::dtype::DType::Int);
-    let qo_indptr = cx
-        .named_tensor("qo_indptr", 'r')
-        .as_dtype(luminal::dtype::DType::Int);
-    let kv_indptr = cx
-        .named_tensor("kv_indptr", 'r')
-        .as_dtype(luminal::dtype::DType::Int);
-
-    let k_cache_out = scatter_rows(k_rope, scatter_idx, k_cache, kv_dim);
-    let v_cache_out = scatter_rows(v_new, scatter_idx, v_cache, kv_dim);
-
-    let k = gather_rows(k_cache_out, gather_idx, kv_dim);
-    let v_ctx = gather_rows(v_cache_out, gather_idx, kv_dim);
-
-    let c: Expression = 'c'.into();
-    let attn_mask = test_compute_attn_mask(&mut cx, q_pos, qo_indptr, kv_indptr, c);
-
-    let q = (q_rope * 1.0).split_dims(1, head_dim).transpose(0, 1);
-    let k = k.split_dims(1, head_dim).permute((1, 2, 0));
-    let v_ctx = v_ctx.split_dims(1, head_dim).transpose(0, 1);
-    let k = k.expand_dim(1, kv_groups).merge_dims(0, 1) * 1.0;
-    let v_ctx = v_ctx.expand_dim(1, kv_groups).merge_dims(0, 1) * 1.0;
-
-    let scores = q.matmul(k) / (head_dim as f32).sqrt();
-    let mask = attn_mask.expand_dim(0, n_heads);
-    let masked_scores = scores + mask;
-    let weights = masked_scores.softmax(2);
-    let out = weights.matmul(v_ctx);
-    let attn_out = out.transpose(0, 1).merge_dims(1, 2);
-
-    attn_out.output();
-    k_cache_out.output();
-    v_cache_out.output();
-
-    (
-        cx,
-        PagedAttnHandles {
-            q_rope,
-            k_rope,
-            v_new,
-            k_cache,
-            v_cache,
-            scatter_idx,
-            gather_idx,
-            q_pos,
-            qo_indptr,
-            kv_indptr,
-        },
-    )
-}
-
-/// Saturate egglog on the graph and report whether a FlashInferAttention
-/// e-node was produced. Helper used by the rule-firing tests.
-fn saturate_and_has_flashinfer(cx: &Graph) -> (bool, Vec<String>) {
-    let (program, root) = hlir_to_egglog(cx);
-    let mut ops = <CudaRuntime as luminal::op::Runtime>::Ops::into_vec();
-    ops.extend(<luminal::hlir::HLIROps as IntoEgglogOp>::into_vec());
-    // cleanup=false: keep every saturation-introduced e-node so we can inspect
-    // whether the FlashInferAttention rule produced a node, regardless of
-    // whether downstream extraction would have pruned it.
-    let egraph = run_egglog(&program, &root, &ops, false).expect("egglog failed");
-
-    let has_flashinfer = egraph
-        .enodes
-        .values()
-        .any(|(label, _)| label == "FlashInferAttention");
-
-    // Collect distinct OpKind labels so a failure can print what *did* match.
-    let mut op_kinds: Vec<String> = egraph
-        .enodes
-        .values()
-        .filter(|(l, _)| {
-            !l.starts_with('(')
-                && ![
-                    "Op",
-                    "Input",
-                    "Output",
-                    "OutputJoin",
-                    "ICons",
-                    "INil",
-                    "ECons",
-                    "ENil",
-                    "MNum",
-                    "MVar",
-                    "MMul",
-                    "MDiv",
-                    "MIter",
-                ]
-                .contains(&l.as_str())
-        })
-        .map(|(l, _)| l.clone())
-        .collect();
-    op_kinds.sort();
-    op_kinds.dedup();
-
-    (has_flashinfer, op_kinds)
-}
-
-/// Debug aid: dump the egglog program and key e-graph metrics for the lite
-/// paged-attention test so we can see why the FlashInfer rule isn't matching.
-#[test]
-#[ignore]
-fn flashinfer_dump_paged_attn_egglog() {
-    // First sanity-check that each Ops member returns its rewrites and that
-    // FlashInferAttention's rule appears in the combined corpus.
-    let ops_vec = <CudaRuntime as luminal::op::Runtime>::Ops::into_vec();
-    eprintln!("==== Ops rewrites count ====");
-    let mut fi_rewrites = 0usize;
-    let mut total_rewrites = 0usize;
-    for op in &ops_vec {
-        let rws = op.rewrites();
-        total_rewrites += rws.len();
-        for r in &rws {
-            let s = format!("{r:?}");
-            if s.contains("FlashInferAttention") {
-                fi_rewrites += 1;
-                eprintln!("FOUND FlashInfer rewrite ({} chars)", s.len());
-            }
-        }
-    }
-    eprintln!(
-        "==== ops_vec.len()={} total_rewrites={total_rewrites} fi_rewrites={fi_rewrites} ====",
-        ops_vec.len()
-    );
-
-    let (cx, _) = build_paged_attention_graph(N_HEADS, N_KV_HEADS, HEAD_DIM);
-    let (program, root) = hlir_to_egglog(&cx);
-    eprintln!("==== EGGLOG PROGRAM (root={root}) ====");
-    for (i, line) in program.lines().enumerate() {
-        eprintln!("{:5}: {line}", i + 1);
-    }
-    eprintln!(
-        "==== END EGGLOG PROGRAM ({} lines) ====",
-        program.lines().count()
-    );
-
-    let mut ops = <CudaRuntime as luminal::op::Runtime>::Ops::into_vec();
-    ops.extend(<luminal::hlir::HLIROps as IntoEgglogOp>::into_vec());
-    let egraph = run_egglog(&program, &root, &ops, false).expect("egglog failed");
-
-    // Bucket enode labels by frequency.
-    let mut counts: std::collections::HashMap<String, usize> = Default::default();
-    for (label, _) in egraph.enodes.values() {
-        *counts.entry(label.clone()).or_default() += 1;
-    }
-    let mut sorted: Vec<_> = counts.iter().collect();
-    sorted.sort_by(|a, b| b.1.cmp(a.1));
-    eprintln!("==== E-GRAPH LABEL HISTOGRAM (top 60) ====");
-    for (label, n) in sorted.iter().take(60) {
-        eprintln!("  {n:6}  {label}");
-    }
-    let has_fi = egraph
-        .enodes
-        .values()
-        .any(|(label, _)| label == "FlashInferAttention");
-    eprintln!("==== has FlashInferAttention enode: {has_fi} ====");
-}
-
-#[test]
-fn flashinfer_rule_does_not_fire_on_bare_attention() {
-    // Dense attention without paged gather + cache should NOT match.
-    let (cx, _, _, _, _) = build_attention_graph();
-    let (has_flashinfer, _) = saturate_and_has_flashinfer(&cx);
-    assert!(
-        !has_flashinfer,
-        "FlashInferAttention should NOT fire on bare attention (no gather/cache)"
-    );
-}
-
-#[test]
-fn flashinfer_rule_does_not_fire_on_unrelated_matmuls() {
-    // A Gather + plain matmul (MLP-shaped projection) plus two chained matmuls
-    // through softmax — close to attention structurally but missing the GQA
-    // broadcast / mask Add anchors. The rule must reject this.
-    let mut cx = Graph::default();
-    let cache = cx.named_tensor("cache", (4096, KV_DIM)).persist();
-    let gather_idx = cx
-        .named_tensor("gather_idx", 'c')
-        .as_dtype(luminal::dtype::DType::Int);
-    let weight = cx.named_tensor("weight", (HIDDEN, KV_DIM)).persist();
-
-    let n = gather_idx.dims1();
-    let base = (gather_idx * KV_DIM).expand_dim(1, KV_DIM);
-    let col = cx.arange(KV_DIM as i32).expand_dim(0, n);
-    let gathered = cache.gather(base + col);
-    let proj = gathered.matmul(weight.t());
-    proj.output();
-
-    let a = cx.named_tensor("a", ('s', HIDDEN));
-    let b = cx.named_tensor("b", (HIDDEN, HIDDEN)).persist();
-    let c_tensor = cx.named_tensor("c_tensor", (HIDDEN, HIDDEN)).persist();
-    let ab = a.matmul(b.t());
-    let abc = ab.softmax(1).matmul(c_tensor.t());
-    abc.output();
-
-    let (has_flashinfer, _) = saturate_and_has_flashinfer(&cx);
-    assert!(
-        !has_flashinfer,
-        "FlashInferAttention should NOT fire on unrelated matmuls + Gather"
-    );
-}
-
-#[test]
-fn flashinfer_rule_fires_on_full_paged_attention() {
-    // Default Llama-shaped test dims (HEAD_DIM=64, N_HEADS=8, N_KV_HEADS=2).
-    let (cx, _) = build_paged_attention_graph(N_HEADS, N_KV_HEADS, HEAD_DIM);
-    let (has_flashinfer, op_kinds) = saturate_and_has_flashinfer(&cx);
-    assert!(
-        has_flashinfer,
-        "FlashInferAttention was NOT found in the e-graph (Llama-shaped paged attention). \
-         OpKinds present: {op_kinds:?}"
-    );
-}
-
-#[test]
-fn flashinfer_rule_fires_on_non_llama_dims() {
-    // Different head counts: HEAD_DIM=64, N_HEADS=16, N_KV_HEADS=4 (group=4).
-    // Exercises the model-agnostic structural variables in the rule.
-    let (cx, _) = build_paged_attention_graph(16, 4, 64);
-    let (has_flashinfer, op_kinds) = saturate_and_has_flashinfer(&cx);
-    assert!(
-        has_flashinfer,
-        "FlashInferAttention was NOT found for non-Llama dims. \
-         OpKinds present: {op_kinds:?}"
-    );
-}
-
-#[test]
-fn flashinfer_rule_fires_on_mha() {
-    // MHA: KV_GROUPS=1 (n_heads == n_kv_heads). The GQA broadcast still
-    // structurally appears (expand_dim(1, 1) + merge), so the rule should
-    // still match.
-    let (cx, _) = build_paged_attention_graph(12, 12, 64);
-    let (has_flashinfer, op_kinds) = saturate_and_has_flashinfer(&cx);
-    assert!(
-        has_flashinfer,
-        "FlashInferAttention was NOT found for MHA dims. \
-         OpKinds present: {op_kinds:?}"
-    );
-}
-
-// ─── Layer 5: extraction reachability (no GPU) ───────────────────────────
-//
-// After `build_search_space` saturates egglog, the GA picks an extraction by
-// cost. In a tiny test graph the cuBLAS+kernel path is often faster than the
-// FlashInfer host op (which pays a `plan()` setup cost per call), so asserting
-// "GA picked FlashInfer" is flaky. Instead, sample many random valid genomes
-// from the search space and assert that the FlashInfer extraction is reachable
-// — meaning the rule fired AND `find_indptrs` extraction succeeded for at
-// least one offspring. That is the end-to-end check we actually want.
-
-#[test]
-fn flashinfer_extraction_reachable_from_search_space() {
-    use rand::SeedableRng;
-    use rand::rngs::StdRng;
-
-    let (mut cx, _h) = build_paged_attention_graph(N_HEADS, N_KV_HEADS, HEAD_DIM);
-    cx.set_dim('s', 1usize);
-    cx.set_dim('c', 16usize);
-    cx.set_dim('r', 2usize);
-    cx.build_search_space::<CudaRuntime>();
-
-    let egraph = cx
-        .egraph()
-        .expect("egraph missing after build_search_space");
-    let ops = cx
-        .egglog_ops()
-        .expect("egglog_ops missing after build_search_space");
-
-    let mut rng = StdRng::seed_from_u64(0xf1a541);
-    let mut prev: FxHashSet<u64> = FxHashSet::default();
-    let initial = luminal::egglog_utils::random_initial_choice(egraph, &mut rng);
-    prev.insert(luminal::egglog_utils::hash_choice_set(&initial));
-    let mut base = initial;
-
-    let mut found = false;
-    'outer: for _ in 0..50 {
-        let offspring =
-            luminal::egglog_utils::extract_generation(egraph, &base, 10, 2, &mut prev, &mut rng);
-        if offspring.is_empty() {
-            break;
-        }
-        for genome in offspring {
-            if luminal::egglog_utils::validate_choice_set(egraph, &genome, ops).is_err() {
-                continue;
-            }
-            let mut list_cache = FxHashMap::default();
-            let mut expr_cache = FxHashMap::default();
-            // Catch a possible panic from find_indptrs walking the mask — we
-            // want the test to fail with a clean message, not abort.
-            let panicked = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
-                luminal::egglog_utils::egglog_to_llir(
-                    egraph,
-                    genome.clone(),
-                    ops,
-                    &cx.custom_ops,
-                    &mut list_cache,
-                    &mut expr_cache,
-                    None,
-                )
-            }));
-            let Ok(llir_graph) = panicked else { continue };
-
-            let has_fi = llir_graph.node_indices().any(|n| {
-                llir_graph[n]
-                    .to_dialect::<dyn HostOp>()
-                    .and_then(|op| op.stats_name())
-                    == Some("FlashInferAttention")
-            });
-            if has_fi {
-                found = true;
-                break 'outer;
-            }
-            base = genome;
-        }
-    }
-    assert!(
-        found,
-        "FlashInferAttention extraction not reachable from search space after 50 generations"
-    );
-}
--- a/crates/luminal_cuda_lite/src/tests/fusion.rs
+++ b/crates/luminal_cuda_lite/src/tests/fusion.rs
@@ -1,9 +1,7 @@
-use as_any::Downcast;
 use luminal::egglog_utils::{egglog_to_llir, random_initial_choice};
 use luminal::prelude::*;

 use crate::kernel::KernelOp;
-use crate::kernel::fusion::{CudaBinaryElementwise, CudaUnaryElementwise};
 use crate::runtime::CudaRuntime;
 use crate::tests::utilities::{
    TOLERANCE_SAFETY_FACTOR, dtype_epsilon, random_f32_vec, test_binary_cuda, test_unary_cuda,
@@ -88,7 +86,7 @@ fn test_unary_fusion_preserves_output() {
 #[test]
 fn test_three_unary_ops_fuse() {
    // A chain of 3 pure-elementwise unaries with matching strides should be
-    // reachable as a single marker region containing all three elementwise ops.
+    // reachable as a single marker region containing all three FusedX ops.
    let mut cx = Graph::new();
    let a = cx.tensor(16);
    let _b = a.sin().sqrt().exp2().output();
@@ -106,7 +104,7 @@ fn test_three_unary_ops_fuse() {
 #[test]
 fn test_four_unary_ops_fuse() {
    // 4-op chain should collapse into a single marker region containing all
-    // four elementwise ops (one pair-fuse + repeated grow-FE→U firings).
+    // four FusedX ops (one pair-fuse + repeated grow-FE→U firings).
    let mut cx = Graph::new();
    let a = cx.tensor(16);
    let _b = a.sin().sqrt().exp2().log2().output();
@@ -319,15 +317,8 @@ fn extract_all_fused_regions(cx: &mut Graph) -> Vec<FusedRegion> {

        let name_of = |idx: NodeIndex| -> Option<String> {
            llir.node_weight(idx).and_then(|op| {
-                op.to_dialect::<dyn KernelOp>().map(|k| {
-                    if let Some(elem) = (***k).downcast_ref::<CudaUnaryElementwise>() {
-                        format!("Fused{}", elem.op)
-                    } else if let Some(elem) = (***k).downcast_ref::<CudaBinaryElementwise>() {
-                        format!("Fused{}", elem.op)
-                    } else {
-                        k.kernel_name().to_string()
-                    }
-                })
+                op.to_dialect::<dyn KernelOp>()
+                    .map(|k| k.kernel_name().to_string())
            })
        };

@@ -352,13 +343,12 @@ fn extract_all_fused_regions(cx: &mut Graph) -> Vec<FusedRegion> {

            // Resolve chains of nested FusionStart wrappers (cascade artifact)
            // to the real external source. A FusionStart whose incoming neighbor
-            // is itself a FusionStart is a cascade layer, not a new external
-            // tensor. A FusionEnd predecessor is a real external region output
-            // in the generic singleton-region model, so do not walk through it.
+            // is itself a FusionStart — or a FusionEnd whose region is fully
+            // inside ours — is a cascade layer, not a new external tensor.
            let resolve_source = |mut n: NodeIndex| -> NodeIndex {
                loop {
                    match name_of(n).as_deref() {
-                        Some("FusionStart") => {
+                        Some("FusionStart") | Some("FusionEnd") => {
                            let mut inc = llir.neighbors_directed(n, petgraph::Direction::Incoming);
                            match inc.next() {
                                Some(p) => n = p,
@@ -389,6 +379,15 @@ fn extract_all_fused_regions(cx: &mut Graph) -> Vec<FusedRegion> {
                            let mut inc =
                                llir.neighbors_directed(pred, petgraph::Direction::Incoming);
                            match inc.next() {
+                                Some(src_node)
+                                    if name_of(src_node).as_deref() == Some("FusionEnd") =>
+                                {
+                                    // Merge adjacent regions — treat the FS/FE
+                                    // pair as internal; walk past the upstream
+                                    // FE into its region.
+                                    visited.insert(src_node);
+                                    stack.push(src_node);
+                                }
                                Some(src_node) => {
                                    start_sources.insert(resolve_source(src_node));
                                }
@@ -468,15 +467,6 @@ fn test_single_binary_does_not_fuse_alone() {
 fn test_chain_of_binaries_fuses() {
    // `(a + b) * c`: three external inputs collapse into one region with
    // internal [Add, Mul] and 3 FusionStarts.
-    //
-    // Requires BB family, which is opt-in at runtime via
-    // LUMINAL_FUSION_FAMILIES. Set it before the graph build so the rules
-    // emitted from FusionEnd::rewrites include the B-B pair-fuse rules.
-    // SAFETY: tests run in parallel; we set this before constructing the
-    // Graph, and never unset, so concurrent tests just see BB on.
-    unsafe {
-        std::env::set_var("LUMINAL_FUSION_FAMILIES", "uu,bu,ub,bb");
-    }
    let mut cx = Graph::new();
    let a = cx.tensor(8);
    let b = cx.tensor(8);
@@ -530,13 +520,6 @@ fn test_unary_then_binary_fuses() {
 }

 #[test]
-// Subsume in grow rules (introduced to bound the BB partial-FE explosion)
-// means a multi-consumer producer can no longer be fused into the same
-// region as all its consumers — only one branch wins. The diamond's `t`
-// has two consumers, so the structural "one 5-op region" outcome is no
-// longer guaranteed. Numerical correctness still holds (see
-// test_diamond_dag_preserves_output).
-#[ignore = "asserts pre-subsume ideal multi-consumer fusion shape"]
 fn test_diamond_dag_fuses() {
    // The canonical diamond-DAG example agreed with the user:
    //   t = a + b; u = exp2(t); v = sin(t); w = u * a; out = w + v
@@ -667,7 +650,6 @@ fn test_diamond_dag_preserves_output() {
 // ---- Marker invariant tests ----

 #[test]
-#[ignore = "asserts pre-subsume ideal multi-consumer fusion shape"]
 fn test_fused_region_has_exactly_one_end() {
    // Design invariant: a fused region always has exactly one FusionEnd.
    // Uses the diamond DAG so there's real fan-in/out inside the region.
@@ -695,7 +677,6 @@ fn test_fused_region_has_exactly_one_end() {
 }

 #[test]
-#[ignore = "asserts pre-subsume ideal multi-consumer fusion shape"]
 fn test_fused_region_starts_match_distinct_external_tensors() {
    // Design invariant: FusionStart count == number of distinct external input
    // tensors, NOT number of edges crossing the boundary. In the diamond DAG
@@ -787,10 +768,6 @@ fn test_pair_fuse_binary_to_binary_rhs() {
    // Pair-fuse B→B (RHS variant): `c * (a + b)`. The inner binary feeds the
    // outer binary's B input, exercising the mirror direction of the rule
    // covered by test_chain_of_binaries_fuses.
-    // See test_chain_of_binaries_fuses for the LUMINAL_FUSION_FAMILIES note.
-    unsafe {
-        std::env::set_var("LUMINAL_FUSION_FAMILIES", "uu,bu,ub,bb");
-    }
    let mut cx = Graph::new();
    let a = cx.tensor(8);
    let b = cx.tensor(8);
@@ -832,7 +809,6 @@ fn test_grow_fe_to_binary_rhs() {
 }

 #[test]
-#[ignore = "asserts pre-subsume two-FE merge shape; numerical correctness preserved"]
 fn test_merge_two_regions_at_outer_binary() {
    // Merge: `(sin(a) + b) + (sqrt(c) + d)`. Each side independently pair-fuses
    // U→B on its own (the unary gives the inner Add a fusion partner that
--- a/crates/luminal_cuda_lite/src/tests/mod.rs
+++ b/crates/luminal_cuda_lite/src/tests/mod.rs
@@ -5,10 +5,6 @@ mod bucket_tests;
 #[cfg(test)]
 mod consumed_buffer_tests;
 #[cfg(test)]
-mod cublaslt_rewrite_tests;
-#[cfg(test)]
-mod flashinfer;
-#[cfg(test)]
 mod fusion;
 #[cfg(test)]
 mod model_fuzz;
@@ -19,8 +15,4 @@ mod performance_tests;
 #[cfg(test)]
 mod qwen3_moe_rewrite;
 #[cfg(test)]
-mod rope_test;
-#[cfg(test)]
-mod search_equivalence_fuzz;
-#[cfg(test)]
 mod transformer;
--- a/crates/luminal_cuda_lite/src/tests/model_fuzz.rs
+++ b/crates/luminal_cuda_lite/src/tests/model_fuzz.rs
@@ -1,12 +1,7 @@
 //! Fuzz tests for model-architecture-specific subgraphs (Llama, Gemma, Qwen).
 //!
 //! Tests many random e-graph extraction variants (genomes) against a candle CPU
-//! reference to catch incorrect HLIR kernel rewrites.
-//!
-//! These are marked ignored by default because each test builds a model-shaped
-//! graph and checks many extraction genomes. Run them explicitly with
-//! `cargo test -p luminal_cuda_lite -- --ignored` when touching extraction,
-//! scheduling, or model-pattern rewrites.
+//! reference to catch incorrect HLIR kernel fallback rewrites.

 use luminal::prelude::*;

@@ -305,7 +300,7 @@ fn fuzz_layer_no_attn(
 }

 /// Test a SwiGLU MLP with HLIR-only to specifically verify
-/// the HLIR matmul decomposition (elementwise Mul + KernelSumReduce).
+/// the HLIR matmul decomposition (KernelMul + KernelSumReduce).
 fn fuzz_mlp_hlir_only(seq: usize, hidden: usize, intermediate: usize, seed: u64) {
    let Some(stream) = get_cuda_stream() else {
        return;
@@ -382,38 +377,32 @@ mod llama {
    const EPS: f32 = 1e-5;

    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_llama_mlp() {
        fuzz_mlp(SEQ, HIDDEN, INTERMEDIATE, 42);
    }

    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_llama_norm_proj() {
        fuzz_norm_proj(SEQ, HIDDEN, PROJ_DIM, EPS, 100);
    }

    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_llama_layer() {
        fuzz_layer_no_attn(SEQ, HIDDEN, INTERMEDIATE, PROJ_DIM, EPS, 200);
    }

    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_llama_mlp_seq1() {
        fuzz_mlp(1, HIDDEN, INTERMEDIATE, 300);
    }

    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_llama_mlp_seq7() {
        fuzz_mlp(7, HIDDEN, INTERMEDIATE, 400);
    }

-    /// Force HLIR-only (no block ops) to specifically test that extraction path.
+    /// Force HLIR-only (no block ops) to specifically test the fallback path.
    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_llama_mlp_hlir_only() {
        fuzz_mlp_hlir_only(SEQ, HIDDEN, INTERMEDIATE, 450);
    }
@@ -435,26 +424,22 @@ mod gemma {
    const EPS: f32 = 1e-6;

    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_gemma_mlp() {
        fuzz_mlp(SEQ, HIDDEN, INTERMEDIATE, 500);
    }

    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_gemma_norm_proj() {
        fuzz_norm_proj(SEQ, HIDDEN, Q_DIM, EPS, 600);
    }

    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_gemma_layer() {
        fuzz_layer_no_attn(SEQ, HIDDEN, INTERMEDIATE, Q_DIM, EPS, 700);
    }

    /// Gemma has extra post-attention and post-feedforward norms.
    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_gemma_layer_full_norms() {
        let Some(stream) = get_cuda_stream() else {
            return;
@@ -579,14 +564,12 @@ mod gemma {
    }

    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_gemma_mlp_seq1() {
        fuzz_mlp(1, HIDDEN, INTERMEDIATE, 900);
    }

-    /// Force HLIR-only to test that extraction path with Gemma dimensions.
+    /// Force HLIR-only to test fallback path with Gemma dimensions.
    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_gemma_mlp_hlir_only() {
        fuzz_mlp_hlir_only(SEQ, HIDDEN, INTERMEDIATE, 950);
    }
@@ -608,26 +591,22 @@ mod qwen {
    const EPS: f32 = 1e-6;

    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_qwen_mlp() {
        fuzz_mlp(SEQ, HIDDEN, INTERMEDIATE, 1000);
    }

    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_qwen_norm_proj() {
        fuzz_norm_proj(SEQ, HIDDEN, Q_DIM, EPS, 1100);
    }

    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_qwen_layer() {
        fuzz_layer_no_attn(SEQ, HIDDEN, INTERMEDIATE, Q_DIM, EPS, 1200);
    }

    /// Qwen uses tied embeddings: lm_head = embedding^T
    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_qwen_lm_head() {
        let Some(stream) = get_cuda_stream() else {
            return;
@@ -689,20 +668,17 @@ mod qwen {
    }

    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_qwen_mlp_seq1() {
        fuzz_mlp(1, HIDDEN, INTERMEDIATE, 1400);
    }

    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_qwen_mlp_seq7() {
        fuzz_mlp(7, HIDDEN, INTERMEDIATE, 1500);
    }

-    /// Force HLIR-only to test that extraction path with Qwen dimensions.
+    /// Force HLIR-only to test fallback path with Qwen dimensions.
    #[test]
-    #[ignore = "expensive CUDA model genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    fn fuzz_qwen_mlp_hlir_only() {
        fuzz_mlp_hlir_only(SEQ, HIDDEN, INTERMEDIATE, 1550);
    }
--- a/crates/luminal_cuda_lite/src/tests/op_functional_tests.rs
+++ b/crates/luminal_cuda_lite/src/tests/op_functional_tests.rs
@@ -16,16 +16,9 @@ use super::utilities::{
    test_binary_cuda, test_mod, test_unary_cuda, to_candle_dtype,
 };

-// The property-based op tests each build/search CUDA graphs for multiple random
-// shapes. They are ignored by default to keep the main CUDA unit suite short;
-// run `cargo test -p luminal_cuda_lite -- --ignored` for the broader sweeps.
-
 proptest! {
    #![proptest_config(ProptestConfig::with_cases(5))]

-    #[ignore = "expensive CUDA op proptest sweep; run with cargo test -p luminal_cuda_lite -- --ignored"]
-
-
    #[test]
    fn test_add(x in 1usize..100, y in 1usize..5, seed in any::<u64>()) {
        let gen_lambda = |n, s| random_f32_vec(n, s, -0.5, 0.5);
@@ -35,9 +28,6 @@ proptest! {
        test_binary_cuda((y, x), (y, x), |a, b| a + b, |a, b| (&a + &b).unwrap(), gen_lambda, gen_lambda, seed, rtol, atol);
    }

-    #[ignore = "expensive CUDA op proptest sweep; run with cargo test -p luminal_cuda_lite -- --ignored"]
-
-
    #[test]
    fn test_mul(x in 1usize..100, y in 1usize..5, seed in any::<u64>()) {
        let gen_lambda = |n, s| random_f32_vec(n, s, -0.5, 0.5);
@@ -47,27 +37,18 @@ proptest! {
        test_binary_cuda((y, x), (y, x), |a, b| a * b, |a, b| (&a * &b).unwrap(), gen_lambda, gen_lambda, seed, rtol, atol);
    }

-    #[ignore = "expensive CUDA op proptest sweep; run with cargo test -p luminal_cuda_lite -- --ignored"]
-
-
    #[test]
    fn test_max(rows in 1usize..8, cols in 1usize..8, seed in any::<u64>()) {
        let gen_lambda = |n, s| random_f32_vec(n, s, -0.5, 0.5);
        test_unary_cuda((rows, cols), |a| a.max(1), |a| a.max(1).unwrap(), gen_lambda, seed);
    }

-    #[ignore = "expensive CUDA op proptest sweep; run with cargo test -p luminal_cuda_lite -- --ignored"]
-
-
    #[test]
    fn test_mean(rows in 1usize..8, cols in 1usize..8, seed in any::<u64>()) {
        let gen_lambda = |n, s| random_f32_vec(n, s, -0.5, 0.5);
        test_unary_cuda((rows, cols), |a| a.mean(1), |a| a.mean(1).unwrap(), gen_lambda, seed);
    }

-    #[ignore = "expensive CUDA op proptest sweep; run with cargo test -p luminal_cuda_lite -- --ignored"]
-
-
    #[test]
    fn test_matmul(
        (m, n, k, a_col_major, b_col_major, m_slice, k_slice, n_slice, dtype) in
@@ -138,8 +119,6 @@ proptest! {
    }

    // Unary ops tests
-    #[ignore = "expensive CUDA op proptest sweep; run with cargo test -p luminal_cuda_lite -- --ignored"]
-
    #[test]
    fn test_exp2(x in 1usize..100, y in 1usize..5, seed in any::<u64>()) {
        // exp2(x) = 2^x, verified by computing 2^x using exp(x * ln(2))
@@ -148,9 +127,6 @@ proptest! {
        test_unary_cuda((y, x), |a| a.exp2(), |a| (a * 2.0f64.ln()).unwrap().exp().unwrap(), gen_lambda, seed);
    }

-    #[ignore = "expensive CUDA op proptest sweep; run with cargo test -p luminal_cuda_lite -- --ignored"]
-
-
    #[test]
    fn test_log2(x in 1usize..100, y in 1usize..5, seed in any::<u64>()) {
        // log2(x) = ln(x) / ln(2)
@@ -159,9 +135,6 @@ proptest! {
        test_unary_cuda((y, x), |a| a.log2(), |a| (a.log().unwrap() / 2.0f64.ln()).unwrap(), gen_lambda, seed);
    }

-    #[ignore = "expensive CUDA op proptest sweep; run with cargo test -p luminal_cuda_lite -- --ignored"]
-
-
    #[test]
    fn test_sin(x in 1usize..100, y in 1usize..5, seed in any::<u64>()) {
        let gen_lambda = |n, s| random_f32_vec(n, s, -0.5, 0.5);
@@ -169,9 +142,6 @@ proptest! {
        test_unary_cuda((y, x), |a| a.sin(), |a| a.sin().unwrap(), gen_lambda, seed);
    }

-    #[ignore = "expensive CUDA op proptest sweep; run with cargo test -p luminal_cuda_lite -- --ignored"]
-
-
    #[test]
    fn test_recip(x in 1usize..100, y in 1usize..5, seed in any::<u64>()) {
        let gen_lambda = |n, s| random_f32_vec(n, s, 0.1, 0.5);
@@ -179,9 +149,6 @@ proptest! {
        test_unary_cuda((y, x), |a| a.reciprocal(), |a| a.recip().unwrap(), gen_lambda, seed);
    }

-    #[ignore = "expensive CUDA op proptest sweep; run with cargo test -p luminal_cuda_lite -- --ignored"]
-
-
    #[test]
    fn test_sqrt(x in 1usize..100, y in 1usize..5, seed in any::<u64>()) {
        let gen_lambda = |n, s| random_f32_vec(n, s, 0.1, 0.6);
@@ -190,17 +157,12 @@ proptest! {
    }

    // Binary ops tests
-    #[ignore = "expensive CUDA op proptest sweep; run with cargo test -p luminal_cuda_lite -- --ignored"]
-
    #[test]
    fn test_mod_op(x in 1usize..100, y in 1usize..5, seed in any::<u64>()) {
        test_mod(x, x, |a, b| a % b, seed);
        test_mod((y, x), (y, x), |a, b| a % b, seed);
    }

-    #[ignore = "expensive CUDA op proptest sweep; run with cargo test -p luminal_cuda_lite -- --ignored"]
-
-
    #[test]
    fn test_less_than(x in 1usize..100, y in 1usize..5, seed in any::<u64>()) {
        let gen_lambda = |n, s| random_f32_vec(n, s, -99.0, 100.0).into_iter().map(|v| v.floor()).collect();
@@ -373,8 +335,6 @@ proptest! {
    #![proptest_config(ProptestConfig::with_cases(5))]

    /// Test F32 -> F16 -> F32 cast roundtrip with random values.
-    #[ignore = "expensive CUDA op proptest sweep; run with cargo test -p luminal_cuda_lite -- --ignored"]
-
    #[test]
    fn test_cast_f16_random(size in 1usize..200, seed in any::<u64>()) {
        use luminal::dtype::DType;
@@ -567,9 +527,6 @@ fn fuzz_test_cuda_genomes_impl(seed: u64) {
 proptest! {
    #![proptest_config(ProptestConfig::with_cases(3))]

-    // This walks random extraction genomes and is intentionally opt-in so the
-    // default CUDA unit suite keeps a tight feedback loop.
-    #[ignore = "expensive CUDA genome fuzzing; run with cargo test -p luminal_cuda_lite -- --ignored"]
    #[test]
    fn fuzz_test_cuda_genomes(seed in any::<u64>()) {
        fuzz_test_cuda_genomes_impl(seed);
@@ -637,9 +594,6 @@ fn run_embed_test(vocab_size: usize, embed_dim: usize, seq_len: usize, seed: u64
 proptest! {
    #![proptest_config(ProptestConfig::with_cases(5))]

-    #[ignore = "expensive CUDA op proptest sweep; run with cargo test -p luminal_cuda_lite -- --ignored"]
-
-
    #[test]
    fn test_embed_proptest(
        vocab_size in 10usize..200,
--- a/crates/luminal_cuda_lite/src/tests/performance_tests.rs
+++ b/crates/luminal_cuda_lite/src/tests/performance_tests.rs
@@ -6,7 +6,7 @@ use crate::cuda_bandwidth_gbps;
 use crate::runtime::CudaRuntime;

 /// Test that measures bandwidth utilization for a large element-wise add kernel.
-/// This demonstrates that generic fused Add can achieve reasonable bandwidth with large tensors.
+/// This demonstrates that KernelAdd can achieve reasonable bandwidth with large tensors.
 #[test]
 pub fn kernel_add_bandwidth_test() {
    // 64M elements = 256MB per tensor, 768MB total memory traffic (2 reads + 1 write)
@@ -40,7 +40,7 @@ pub fn kernel_add_bandwidth_test() {
    rt.execute(&cx.dyn_map);

    // Print stats
-    println!("\n=== Large Fused Add Bandwidth Test ===");
+    println!("\n=== Large KernelAdd Bandwidth Test ===");
    println!(
        "Tensor size: {} elements ({} MB per tensor)",
        size,
--- a/crates/luminal_cuda_lite/src/tests/qwen3_moe_rewrite.rs
+++ b/crates/luminal_cuda_lite/src/tests/qwen3_moe_rewrite.rs
@@ -3,15 +3,18 @@ use luminal::{dtype::DType, prelude::*, shape::Expression};

 use super::utilities::{assert_close, get_cuda_stream, random_f32_vec};
 use crate::{
-    host::moe::{GLUMoE, GLUMoEMode},
+    host::{
+        HostOp,
+        moe::{GLUMoE, GLUMoEMode},
+    },
    runtime::CudaRuntime,
 };

 const SEQ: usize = 2;
-const HIDDEN: usize = 32;
+const HIDDEN: usize = 16;
 const NUM_EXPERTS: usize = 8;
 const TOP_K: usize = 2;
-const MOE_INTERMEDIATE: usize = 12;
+const MOE_INTERMEDIATE: usize = 6;
 const RMS_NORM_EPS: f32 = 1e-6;

 struct QwenMoeGraph {
@@ -58,7 +61,6 @@ fn build_qwen_moe_graph() -> QwenMoeGraph {
        .iota(Expression::from('z') / k_expr * e_dim, top_k_indices.dims());
    let routing_flat_idx = row_offsets + top_k_indices;
    let top_k_values = routing_weights.gather(routing_flat_idx);
-    let top_k_values = top_k_values / top_k_values.sum(n - 1).expand_dim(n - 1, TOP_K);

    let gate_up_gathered = gather_experts(x, top_k_indices, gate_up_weights).cast(DType::F32);
    let x_exp = x.expand_dim(n - 1, TOP_K).unsqueeze(n);
@@ -72,9 +74,9 @@ fn build_qwen_moe_graph() -> QwenMoeGraph {
        .unsqueeze(2)
        .matmul(down_gathered.transpose(2, 3))
        .squeeze(2);
-    let mut weights_exp = top_k_values.unsqueeze(top_k_values.dims().len());
-    weights_exp.shape.expand(down_out.dims());
-    let output = (down_out * weights_exp).sum(n - 1).output();
+    let output = (down_out * top_k_values.unsqueeze(top_k_values.dims().len()))
+        .sum(n - 1)
+        .output();

    QwenMoeGraph {
        graph: cx,
@@ -131,9 +133,9 @@ fn build_gemma_moe_graph() -> GemmaMoeGraph {
        .unsqueeze(2)
        .matmul(down_gathered.transpose(2, 3))
        .squeeze(2);
-    let mut weights_exp = top_k_weights.unsqueeze(top_k_weights.dims().len());
-    weights_exp.shape.expand(down_out.dims());
-    let output = (down_out * weights_exp).sum(n - 1).output();
+    let output = (down_out * top_k_weights.unsqueeze(top_k_weights.dims().len()))
+        .sum(n - 1)
+        .output();

    GemmaMoeGraph {
        graph: cx,
@@ -174,9 +176,10 @@ fn gemma_gelu(x: GraphTensor) -> GraphTensor {
 }

 fn glumoe_modes(rt: &CudaRuntime) -> Vec<GLUMoEMode> {
-    rt.host_ops()
-        .into_iter()
-        .filter_map(|op| {
+    rt.llir_graph()
+        .node_weights()
+        .filter_map(|node| {
+            let op = node.to_dialect::<dyn HostOp>()?;
            op.as_any()
                .downcast_ref::<GLUMoE>()
                .map(|glumoe| glumoe.mode)
@@ -271,7 +274,7 @@ fn test_glumoe_matches_qwen_swiglu_pattern() {
        return;
    }

-    assert_eq!(modes, vec![GLUMoEMode::SwiGLUNormalized]);
+    assert_eq!(modes, vec![GLUMoEMode::SwiGLU]);
 }

 #[test]
@@ -293,7 +296,7 @@ fn test_glumoe_swiglu_matches_unfused_output() {
    assert!(baseline_modes.is_empty());

    let (actual, fused_modes) = run_qwen_moe(true);
-    assert_eq!(fused_modes, vec![GLUMoEMode::SwiGLUNormalized]);
+    assert_eq!(fused_modes, vec![GLUMoEMode::SwiGLU]);
    assert_close(&actual, &expected, 3e-2, 3e-2);
 }

--- a/crates/luminal_cuda_lite/src/tests/rope_test.rs
+++ b/crates/luminal_cuda_lite/src/tests/rope_test.rs
@@ -1,112 +0,0 @@
-use cudarc::driver::CudaContext;
-use luminal::{graph::Graph, op::Runtime};
-
-use crate::{kernel::apply_rope, runtime::CudaRuntime};
-
-fn cpu_rope(x: &[f32], cos: &[f32], sin: &[f32], s: usize, h: usize, d: usize) -> Vec<f32> {
-    assert!(d.is_multiple_of(2));
-    let mut out = vec![0.0f32; s * h * d];
-    for si in 0..s {
-        for hi in 0..h {
-            for i in 0..d {
-                let xi = x[si * h * d + hi * d + i];
-                let xpair = if i % 2 == 0 {
-                    -x[si * h * d + hi * d + i + 1]
-                } else {
-                    x[si * h * d + hi * d + i - 1]
-                };
-                let c = cos[si * d + i];
-                let sn = sin[si * d + i];
-                out[si * h * d + hi * d + i] = xi * c + xpair * sn;
-            }
-        }
-    }
-    out
-}
-
-#[test]
-fn rope_matches_cpu_reference() {
-    let s = 8;
-    let h = 4;
-    let d = 32;
-    let mut cx = Graph::default();
-    let x = cx.tensor((s, h, d));
-    let cos = cx.tensor((s, d));
-    let sin = cx.tensor((s, d));
-    let y = apply_rope(x, cos, sin).output();
-
-    let x_data: Vec<f32> = (0..s * h * d).map(|i| ((i as f32) * 0.013).sin()).collect();
-    let cos_data: Vec<f32> = (0..s * d).map(|i| ((i as f32) * 0.017).cos()).collect();
-    let sin_data: Vec<f32> = (0..s * d).map(|i| ((i as f32) * 0.017).sin()).collect();
-
-    let ctx = CudaContext::new(0).unwrap();
-    ctx.bind_to_thread().unwrap();
-    let stream = ctx.default_stream();
-    cx.build_search_space::<CudaRuntime>();
-    let mut rt = CudaRuntime::initialize(stream);
-    rt.set_data(x, x_data.clone());
-    rt.set_data(cos, cos_data.clone());
-    rt.set_data(sin, sin_data.clone());
-    rt = cx.search(rt, 1);
-    rt.execute(&cx.dyn_map);
-    let got = rt.get_f32(y.id);
-
-    let expected = cpu_rope(&x_data, &cos_data, &sin_data, s, h, d);
-    let mut max_err = 0.0f32;
-    for (g, e) in got.iter().zip(expected.iter()) {
-        let err = (g - e).abs();
-        if err > max_err {
-            max_err = err;
-        }
-    }
-    eprintln!("rope: max abs err: {max_err}");
-    assert!(max_err < 1e-5, "max abs error {max_err} too high");
-}
-
-#[test]
-fn rope_flux2_shape() {
-    // Flux 2 transformer attention: S=1536 (img+txt), H=48, D=128.
-    let s = 1536;
-    let h = 48;
-    let d = 128;
-    let mut cx = Graph::default();
-    let x = cx.tensor((s, h, d));
-    let cos = cx.tensor((s, d));
-    let sin = cx.tensor((s, d));
-    let y = apply_rope(x, cos, sin).output();
-
-    use rand::{Rng, SeedableRng};
-    let mut rng = rand::rngs::SmallRng::seed_from_u64(11);
-    let x_data: Vec<f32> = (0..s * h * d)
-        .map(|_| rng.random_range(-2.0..2.0_f32))
-        .collect();
-    let cos_data: Vec<f32> = (0..s * d)
-        .map(|_| rng.random_range(-1.0..1.0_f32))
-        .collect();
-    let sin_data: Vec<f32> = (0..s * d)
-        .map(|_| rng.random_range(-1.0..1.0_f32))
-        .collect();
-
-    let ctx = CudaContext::new(0).unwrap();
-    ctx.bind_to_thread().unwrap();
-    let stream = ctx.default_stream();
-    cx.build_search_space::<CudaRuntime>();
-    let mut rt = CudaRuntime::initialize(stream);
-    rt.set_data(x, x_data.clone());
-    rt.set_data(cos, cos_data.clone());
-    rt.set_data(sin, sin_data.clone());
-    rt = cx.search(rt, 1);
-    rt.execute(&cx.dyn_map);
-    let got = rt.get_f32(y.id);
-
-    let expected = cpu_rope(&x_data, &cos_data, &sin_data, s, h, d);
-    let mut max_err = 0.0f32;
-    for (g, e) in got.iter().zip(expected.iter()) {
-        let err = (g - e).abs();
-        if err > max_err {
-            max_err = err;
-        }
-    }
-    eprintln!("rope flux2: max abs err: {max_err}");
-    assert!(max_err < 1e-4, "max abs error {max_err} too high");
-}
--- a/crates/luminal_cuda_lite/src/tests/search_equivalence_fuzz.rs
+++ b/crates/luminal_cuda_lite/src/tests/search_equivalence_fuzz.rs
@@ -1,374 +0,0 @@
-//! End-to-end e-graph search-space equivalence fuzz tests.
-//!
-//! These tests do not compare against a hand-written reference. They assert the
-//! stronger search invariant: every selectable LLIR graph from the same e-graph
-//! must produce the same outputs for the same runtime inputs.
-
-#[allow(dead_code)]
-#[path = "../../../../examples/llama/src/model.rs"]
-mod llama_model;
-
-use half::bf16;
-use luminal::{dtype::DType, prelude::*, shape::Expression};
-use rand::{Rng, SeedableRng, rngs::StdRng};
-
-use super::utilities::{CudaSearchEquivalenceFuzzer, get_cuda_stream, random_f32_vec};
-
-const SEARCH_EQUIV_SAMPLES: usize = 32;
-
-fn random_bf16_vec(n: usize, seed: u64, low: f32, high: f32) -> Vec<bf16> {
-    random_f32_vec(n, seed, low, high)
-        .into_iter()
-        .map(bf16::from_f32)
-        .collect()
-}
-
-fn rms_norm(x: GraphTensor, weight: GraphTensor, eps: f32) -> GraphTensor {
-    let normed = x.std_norm(x.shape.last_axis(), eps);
-    normed * weight.expand_lhs(&x.dims()[..x.dims().len() - 1])
-}
-
-#[allow(clippy::excessive_precision)]
-fn gemma_gelu(x: GraphTensor) -> GraphTensor {
-    let scaled = 1.5957691216 * x * (1. + 0.044715 * x * x);
-    x * scaled.sigmoid()
-}
-
-fn gather_experts(
-    graph_source: GraphTensor,
-    top_k_indices: GraphTensor,
-    weights: GraphTensor,
-) -> GraphTensor {
-    let (_, d1, d2) = weights.dims3();
-    let io = d1 * d2;
-    let base = top_k_indices * io;
-    let within = graph_source.graph().iota(Expression::from('z'), (d1, d2));
-    let n_base = base.dims().len();
-    let exp_base = base.expand_dim(n_base, d1).expand_dim(n_base + 1, d2);
-    let mut exp_within = within;
-    for (axis, dim) in base.dims().iter().enumerate() {
-        exp_within = exp_within.expand_dim(axis, *dim);
-    }
-    let expert_flat_idx = exp_base + exp_within;
-    weights.gather(expert_flat_idx)
-}
-
-#[test]
-fn llama_architecture_search_space_equivalence_fuzz() {
-    let Some(stream) = get_cuda_stream() else {
-        return;
-    };
-
-    const SEQ: usize = 2;
-    const CTX: usize = 3;
-    const SLOTS: usize = 4;
-
-    let config = llama_model::LlamaConfig {
-        layers: 2,
-        hidden: 32,
-        intermediate: 64,
-        head_dim: 8,
-        kv_groups: 2,
-        vocab_size: 64,
-    };
-
-    let mut cx = Graph::default();
-    cx.set_dim('s', SEQ);
-    cx.set_dim('c', CTX);
-
-    let input = cx.named_tensor("input", 's').as_dtype(DType::Int);
-    let q_pos = cx.named_tensor("q_pos", 's').as_dtype(DType::Int);
-    let scatter_idx = cx.named_tensor("scatter_idx", 's').as_dtype(DType::Int);
-    let gather_idx = cx.named_tensor("gather_idx", 'c').as_dtype(DType::Int);
-    let attn_mask = cx.named_tensor("attn_mask", ('s', 'c'));
-    let kv_cache = llama_model::KVCache::new_with_config(&mut cx, SLOTS, config);
-    let llama = llama_model::Llama::init_with_config(&mut cx, config);
-
-    let (logits, cache_outputs) =
-        llama.forward(input, q_pos, scatter_idx, gather_idx, attn_mask, &kv_cache);
-    let logits = logits.output();
-    let mut fuzzer = CudaSearchEquivalenceFuzzer::new(&mut cx, &stream)
-        .seed(0x5EED_1234)
-        .samples(SEARCH_EQUIV_SAMPLES)
-        .generation_size(8)
-        .mutations(3)
-        .build_options(BuildSearchSpaceOptions::new().max_memory_mib(512))
-        .output_f32(logits.id, "logits", 3e-3, 3e-3);
-    for (layer, (k_out, v_out)) in cache_outputs.into_iter().enumerate() {
-        let k_out = k_out.output();
-        let v_out = v_out.output();
-        fuzzer = fuzzer.output_f32(k_out.id, format!("layer{layer}.k_cache"), 3e-3, 3e-3);
-        fuzzer = fuzzer.output_f32(v_out.id, format!("layer{layer}.v_cache"), 3e-3, 3e-3);
-    }
-
-    let mut rng = StdRng::seed_from_u64(0x11A_AA55);
-    fuzzer = fuzzer
-        .input_i32(input.id, vec![3, 17])
-        .input_i32(q_pos.id, vec![1, 2])
-        .input_i32(scatter_idx.id, vec![1, 2])
-        .input_i32(gather_idx.id, vec![0, 1, 2])
-        .input_f32(attn_mask.id, vec![0.0, 0.0, -1e4, 0.0, 0.0, 0.0]);
-
-    let kv_dim = config.kv_dim();
-    for tensor in kv_cache.tensors() {
-        fuzzer = fuzzer.input_f32(tensor.id, vec![0.0; SLOTS * kv_dim]);
-    }
-    for tensor in llama.parameter_tensors() {
-        let elements = tensor
-            .dims()
-            .iter()
-            .map(|dim| dim.to_usize().expect("tiny llama test uses static params"))
-            .product::<usize>();
-        let data = (0..elements)
-            .map(|_| rng.random_range(-0.08f32..0.08f32))
-            .collect::<Vec<_>>();
-        fuzzer = fuzzer.input_f32(tensor.id, data);
-    }
-
-    let report = fuzzer.run();
-    eprintln!("llama search equivalence fuzz report: {report:?}");
-}
-
-#[test]
-fn gemma_architecture_search_space_equivalence_fuzz() {
-    let Some(stream) = get_cuda_stream() else {
-        return;
-    };
-
-    const SEQ: usize = 2;
-    const HIDDEN: usize = 32;
-    const Q_DIM: usize = 24;
-    const INTERMEDIATE: usize = 64;
-    const EPS: f32 = 1e-6;
-
-    let mut cx = Graph::default();
-    let input = cx.tensor((SEQ, HIDDEN));
-    let attn_norm_w = cx.tensor(HIDDEN);
-    let post_attn_norm_w = cx.tensor(HIDDEN);
-    let pre_ff_norm_w = cx.tensor(HIDDEN);
-    let post_ff_norm_w = cx.tensor(HIDDEN);
-    let proj_w = cx.tensor((Q_DIM, HIDDEN));
-    let o_proj_w = cx.tensor((HIDDEN, Q_DIM));
-    let w_gate = cx.tensor((INTERMEDIATE, HIDDEN));
-    let w_up = cx.tensor((INTERMEDIATE, HIDDEN));
-    let w_down = cx.tensor((HIDDEN, INTERMEDIATE));
-
-    let normed = rms_norm(input, attn_norm_w, EPS);
-    let proj_out = normed.matmul(proj_w.t()).matmul(o_proj_w.t());
-    let attn_normed = rms_norm(proj_out, post_attn_norm_w, EPS);
-    let x = input + attn_normed;
-    let ff_normed = rms_norm(x, pre_ff_norm_w, EPS);
-    let mlp_out =
-        (gemma_gelu(ff_normed.matmul(w_gate.t())) * ff_normed.matmul(w_up.t())).matmul(w_down.t());
-    let mlp_normed = rms_norm(mlp_out, post_ff_norm_w, EPS);
-    let out = (x + mlp_normed).output();
-
-    let report = CudaSearchEquivalenceFuzzer::new(&mut cx, &stream)
-        .seed(0x6E4D_4DAA)
-        .samples(SEARCH_EQUIV_SAMPLES)
-        .generation_size(8)
-        .mutations(3)
-        .build_options(BuildSearchSpaceOptions::new().max_memory_mib(512))
-        .input_f32(input.id, random_f32_vec(SEQ * HIDDEN, 101, -0.15, 0.15))
-        .input_f32(attn_norm_w.id, random_f32_vec(HIDDEN, 102, 0.7, 1.3))
-        .input_f32(post_attn_norm_w.id, random_f32_vec(HIDDEN, 103, 0.7, 1.3))
-        .input_f32(pre_ff_norm_w.id, random_f32_vec(HIDDEN, 104, 0.7, 1.3))
-        .input_f32(post_ff_norm_w.id, random_f32_vec(HIDDEN, 105, 0.7, 1.3))
-        .input_f32(proj_w.id, random_f32_vec(Q_DIM * HIDDEN, 106, -0.08, 0.08))
-        .input_f32(
-            o_proj_w.id,
-            random_f32_vec(HIDDEN * Q_DIM, 107, -0.08, 0.08),
-        )
-        .input_f32(
-            w_gate.id,
-            random_f32_vec(INTERMEDIATE * HIDDEN, 108, -0.08, 0.08),
-        )
-        .input_f32(
-            w_up.id,
-            random_f32_vec(INTERMEDIATE * HIDDEN, 109, -0.08, 0.08),
-        )
-        .input_f32(
-            w_down.id,
-            random_f32_vec(HIDDEN * INTERMEDIATE, 110, -0.08, 0.08),
-        )
-        .output_f32(out.id, "gemma_block", 5e-3, 5e-3)
-        .run();
-    eprintln!("gemma search equivalence fuzz report: {report:?}");
-}
-
-#[test]
-fn moe_architecture_search_space_equivalence_fuzz() {
-    let Some(stream) = get_cuda_stream() else {
-        return;
-    };
-
-    const SEQ: usize = 2;
-    const HIDDEN: usize = 16;
-    const NUM_EXPERTS: usize = 8;
-    const TOP_K: usize = 2;
-    const MOE_INTERMEDIATE: usize = 6;
-    const EPS: f32 = 1e-6;
-
-    let mut cx = Graph::default();
-    let router_input = cx.tensor(('s', HIDDEN));
-    let expert_input = cx.tensor(('s', HIDDEN));
-    let router_scale = cx.tensor(HIDDEN);
-    let router_proj = cx.tensor((NUM_EXPERTS, HIDDEN));
-    let per_expert_scale = cx.tensor(NUM_EXPERTS);
-    let gate_up_weights = cx
-        .tensor((NUM_EXPERTS, MOE_INTERMEDIATE * 2, HIDDEN))
-        .as_dtype(DType::Bf16);
-    let down_weights = cx
-        .tensor((NUM_EXPERTS, HIDDEN, MOE_INTERMEDIATE))
-        .as_dtype(DType::Bf16);
-
-    let n = router_input.dims().len();
-    let e_dim = *router_proj.dims().first().unwrap();
-    let k_expr = Expression::from(TOP_K);
-
-    let router_hidden = router_input.std_norm(n - 1, EPS)
-        * router_scale.expand_lhs(&router_input.dims()[..n - 1])
-        * (HIDDEN as f32).sqrt().recip();
-    let routing_weights = router_hidden.matmul(router_proj.t()).softmax(n - 1);
-
-    let top_k_indices = routing_weights.topk_indexes(TOP_K, n - 1);
-    let row_offsets = router_input
-        .graph()
-        .iota(Expression::from('z') / k_expr * e_dim, top_k_indices.dims());
-    let routing_flat_idx = row_offsets + top_k_indices;
-    let top_k_values = routing_weights.gather(routing_flat_idx);
-    let top_k_norm = top_k_values.sum(n - 1).expand_dim(n - 1, TOP_K);
-    let top_k_weights = (top_k_values / top_k_norm) * per_expert_scale.gather(top_k_indices);
-
-    let gate_up_gathered =
-        gather_experts(expert_input, top_k_indices, gate_up_weights).cast(DType::F32);
-    let x_exp = expert_input.expand_dim(n - 1, TOP_K).unsqueeze(n);
-    let gate_up_out = x_exp.matmul(gate_up_gathered.transpose(2, 3)).squeeze(n);
-    let gate = gate_up_out.slice((.., .., ..MOE_INTERMEDIATE));
-    let up = gate_up_out.slice((.., .., MOE_INTERMEDIATE..));
-    let hidden = gemma_gelu(gate) * up;
-
-    let down_gathered = gather_experts(expert_input, top_k_indices, down_weights).cast(DType::F32);
-    let down_out = hidden
-        .unsqueeze(2)
-        .matmul(down_gathered.transpose(2, 3))
-        .squeeze(2);
-    let mut weights_exp = top_k_weights.unsqueeze(top_k_weights.dims().len());
-    weights_exp.shape.expand(down_out.dims());
-    let out = (down_out * weights_exp).sum(n - 1).output();
-    cx.set_dim('s', SEQ);
-
-    let report = CudaSearchEquivalenceFuzzer::new(&mut cx, &stream)
-        .seed(0x0DEE_55EE)
-        .samples(SEARCH_EQUIV_SAMPLES)
-        .generation_size(8)
-        .mutations(3)
-        .build_options(BuildSearchSpaceOptions::new().max_memory_mib(512))
-        .input_f32(
-            router_input.id,
-            random_f32_vec(SEQ * HIDDEN, 201, -0.15, 0.15),
-        )
-        .input_f32(
-            expert_input.id,
-            random_f32_vec(SEQ * HIDDEN, 202, -0.15, 0.15),
-        )
-        .input_f32(router_scale.id, random_f32_vec(HIDDEN, 203, 0.7, 1.3))
-        .input_f32(
-            router_proj.id,
-            random_f32_vec(NUM_EXPERTS * HIDDEN, 204, -0.2, 0.2),
-        )
-        .input_f32(
-            per_expert_scale.id,
-            random_f32_vec(NUM_EXPERTS, 205, 0.5, 1.5),
-        )
-        .input_bf16(
-            gate_up_weights.id,
-            random_bf16_vec(NUM_EXPERTS * MOE_INTERMEDIATE * 2 * HIDDEN, 206, -0.1, 0.1),
-        )
-        .input_bf16(
-            down_weights.id,
-            random_bf16_vec(NUM_EXPERTS * HIDDEN * MOE_INTERMEDIATE, 207, -0.1, 0.1),
-        )
-        .output_f32(out.id, "gemma_moe_block", 5e-2, 5e-2)
-        .run();
-    eprintln!("moe search equivalence fuzz report: {report:?}");
-}
-
-#[test]
-fn moe_architecture_native_reference_fuzz() {
-    let Some(stream) = get_cuda_stream() else {
-        return;
-    };
-
-    const SEQ: usize = 2;
-    const HIDDEN: usize = 16;
-    const NUM_EXPERTS: usize = 8;
-    const TOP_K: usize = 2;
-    const MOE_INTERMEDIATE: usize = 6;
-
-    let mut cx = Graph::default();
-    let input = cx.tensor(('s', HIDDEN));
-    let router = cx.tensor((NUM_EXPERTS, HIDDEN));
-    let gate_up_weights = cx
-        .tensor((NUM_EXPERTS, MOE_INTERMEDIATE * 2, HIDDEN))
-        .as_dtype(DType::Bf16);
-    let down_weights = cx
-        .tensor((NUM_EXPERTS, HIDDEN, MOE_INTERMEDIATE))
-        .as_dtype(DType::Bf16);
-
-    let n = input.dims().len();
-    let e_dim = *router.dims().first().unwrap();
-    let k_expr = Expression::from(TOP_K);
-
-    let routing_weights = input.matmul(router.t()).softmax(n - 1);
-    let top_k_indices = routing_weights.topk_indexes(TOP_K, n - 1);
-    let row_offsets = input
-        .graph()
-        .iota(Expression::from('z') / k_expr * e_dim, top_k_indices.dims());
-    let routing_flat_idx = row_offsets + top_k_indices;
-    let top_k_values = routing_weights.gather(routing_flat_idx);
-    let top_k_weights = top_k_values / top_k_values.sum(n - 1).expand_dim(n - 1, TOP_K);
-
-    let gate_up_gathered = gather_experts(input, top_k_indices, gate_up_weights).cast(DType::F32);
-    let input_exp = input.expand_dim(n - 1, TOP_K).unsqueeze(n);
-    let gate_up_out = input_exp
-        .matmul(gate_up_gathered.transpose(2, 3))
-        .squeeze(n);
-    let gate = gate_up_out.slice((.., .., ..MOE_INTERMEDIATE));
-    let up = gate_up_out.slice((.., .., MOE_INTERMEDIATE..));
-    let hidden = gate.silu() * up;
-
-    let down_gathered = gather_experts(input, top_k_indices, down_weights).cast(DType::F32);
-    let down_out = hidden
-        .unsqueeze(2)
-        .matmul(down_gathered.transpose(2, 3))
-        .squeeze(2);
-    let mut weights_exp = top_k_weights.unsqueeze(top_k_weights.dims().len());
-    weights_exp.shape.expand(down_out.dims());
-    let out = (down_out * weights_exp).sum(n - 1).output();
-    cx.set_dim('s', SEQ);
-
-    let report = CudaSearchEquivalenceFuzzer::new(&mut cx, &stream)
-        .seed(0x51A7_E5ED)
-        .samples(SEARCH_EQUIV_SAMPLES)
-        .generation_size(8)
-        .mutations(3)
-        .build_options(BuildSearchSpaceOptions::new().max_memory_mib(512))
-        .native_reference()
-        .input_f32(input.id, random_f32_vec(SEQ * HIDDEN, 301, -0.15, 0.15))
-        .input_f32(
-            router.id,
-            random_f32_vec(NUM_EXPERTS * HIDDEN, 302, -0.2, 0.2),
-        )
-        .input_bf16(
-            gate_up_weights.id,
-            random_bf16_vec(NUM_EXPERTS * MOE_INTERMEDIATE * 2 * HIDDEN, 303, -0.1, 0.1),
-        )
-        .input_bf16(
-            down_weights.id,
-            random_bf16_vec(NUM_EXPERTS * HIDDEN * MOE_INTERMEDIATE, 304, -0.1, 0.1),
-        )
-        .output_f32(out.id, "qwen_swiglu_moe_native_reference", 6e-2, 6e-2)
-        .run();
-    eprintln!("moe native-reference fuzz report: {report:?}");
-}
--- a/crates/luminal_cuda_lite/src/tests/utilities.rs
+++ b/crates/luminal_cuda_lite/src/tests/utilities.rs
@@ -2,8 +2,7 @@ use candle_core::{Device, Tensor, WithDType};
 use cudarc::driver::CudaContext;
 use half::{bf16, f16};
 use luminal::egglog_utils::{
-    EGraphChoiceSet, egglog_to_llir, extract_generation, hash_choice_set, random_initial_choice,
-    validate_choice_set,
+    egglog_to_llir, extract_generation, hash_choice_set, random_initial_choice, validate_choice_set,
 };
 use luminal::prelude::*;
 use num_traits::{Num, Signed};
@@ -129,399 +128,6 @@ pub fn get_cuda_stream() -> Option<Arc<cudarc::driver::CudaStream>> {
    Some(ctx.default_stream())
 }

-#[derive(Debug, Clone)]
-pub enum CudaFuzzInput {
-    F32(NodeIndex, Vec<f32>),
-    Bf16(NodeIndex, Vec<bf16>),
-    I32(NodeIndex, Vec<i32>),
-}
-
-impl CudaFuzzInput {
-    fn apply(&self, rt: &mut CudaRuntime) {
-        match self {
-            Self::F32(id, data) => rt.set_data(*id, data.clone()),
-            Self::Bf16(id, data) => rt.set_data(*id, data.clone()),
-            Self::I32(id, data) => rt.set_data(*id, data.clone()),
-        }
-    }
-
-    fn apply_native(&self, rt: &mut NativeRuntime) {
-        match self {
-            Self::F32(id, data) => rt.set_data(*id, data.clone()),
-            Self::Bf16(id, data) => rt.set_data(*id, data.clone()),
-            Self::I32(id, data) => rt.set_data(*id, data.clone()),
-        }
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct F32OutputCheck {
-    pub id: NodeIndex,
-    pub name: String,
-    pub rtol: f32,
-    pub atol: f32,
-}
-
-impl F32OutputCheck {
-    pub fn new(id: NodeIndex, name: impl Into<String>, rtol: f32, atol: f32) -> Self {
-        Self {
-            id,
-            name: name.into(),
-            rtol,
-            atol,
-        }
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct SearchEquivalenceFuzzConfig {
-    pub seed: u64,
-    pub samples: usize,
-    pub generation_size: usize,
-    pub mutations: usize,
-    pub max_attempts: usize,
-    pub build_options: BuildSearchSpaceOptions,
-    pub reference: SearchEquivalenceReference,
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum SearchEquivalenceReference {
-    FirstCudaExtraction,
-    NativeRuntime,
-}
-
-impl Default for SearchEquivalenceFuzzConfig {
-    fn default() -> Self {
-        Self {
-            seed: 0,
-            samples: 32,
-            generation_size: 16,
-            mutations: 2,
-            max_attempts: 1_000,
-            build_options: BuildSearchSpaceOptions::default(),
-            reference: SearchEquivalenceReference::FirstCudaExtraction,
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub struct SearchEquivalenceFuzzReport {
-    pub tested: usize,
-    pub skipped_invalid: usize,
-}
-
-pub struct CudaSearchEquivalenceFuzzer<'a> {
-    cx: &'a mut Graph,
-    stream: &'a Arc<cudarc::driver::CudaStream>,
-    inputs: Vec<CudaFuzzInput>,
-    outputs: Vec<F32OutputCheck>,
-    config: SearchEquivalenceFuzzConfig,
-}
-
-impl<'a> CudaSearchEquivalenceFuzzer<'a> {
-    pub fn new(cx: &'a mut Graph, stream: &'a Arc<cudarc::driver::CudaStream>) -> Self {
-        Self {
-            cx,
-            stream,
-            inputs: Vec::new(),
-            outputs: Vec::new(),
-            config: SearchEquivalenceFuzzConfig::default(),
-        }
-    }
-
-    pub fn seed(mut self, seed: u64) -> Self {
-        self.config.seed = seed;
-        self
-    }
-
-    pub fn samples(mut self, samples: usize) -> Self {
-        self.config.samples = samples;
-        self
-    }
-
-    pub fn generation_size(mut self, generation_size: usize) -> Self {
-        self.config.generation_size = generation_size;
-        self
-    }
-
-    pub fn mutations(mut self, mutations: usize) -> Self {
-        self.config.mutations = mutations;
-        self
-    }
-
-    pub fn build_options(mut self, build_options: BuildSearchSpaceOptions) -> Self {
-        self.config.build_options = build_options;
-        self
-    }
-
-    pub fn native_reference(mut self) -> Self {
-        self.config.reference = SearchEquivalenceReference::NativeRuntime;
-        self
-    }
-
-    pub fn input_f32(mut self, id: NodeIndex, data: Vec<f32>) -> Self {
-        self.inputs.push(CudaFuzzInput::F32(id, data));
-        self
-    }
-
-    pub fn input_bf16(mut self, id: NodeIndex, data: Vec<bf16>) -> Self {
-        self.inputs.push(CudaFuzzInput::Bf16(id, data));
-        self
-    }
-
-    pub fn input_i32(mut self, id: NodeIndex, data: Vec<i32>) -> Self {
-        self.inputs.push(CudaFuzzInput::I32(id, data));
-        self
-    }
-
-    pub fn output_f32(
-        mut self,
-        id: NodeIndex,
-        name: impl Into<String>,
-        rtol: f32,
-        atol: f32,
-    ) -> Self {
-        self.outputs.push(F32OutputCheck::new(id, name, rtol, atol));
-        self
-    }
-
-    pub fn run(self) -> SearchEquivalenceFuzzReport {
-        fuzz_cuda_search_space_equivalence(
-            self.cx,
-            self.stream,
-            &self.inputs,
-            &self.outputs,
-            self.config,
-        )
-    }
-}
-
-/// End-to-end search-space equivalence fuzzing for CUDA.
-///
-/// This builds the normal CUDA e-graph search space, extracts random selectable
-/// LLIR graphs, runs each with identical inputs, and verifies every requested
-/// f32 output matches the first valid extraction. The reference is intentionally
-/// another selected LLIR graph, not a hand-written CPU implementation: this
-/// catches cases where supposedly equivalent e-graph choices diverge.
-pub fn fuzz_cuda_search_space_equivalence(
-    cx: &mut Graph,
-    stream: &Arc<cudarc::driver::CudaStream>,
-    inputs: &[CudaFuzzInput],
-    outputs: &[F32OutputCheck],
-    config: SearchEquivalenceFuzzConfig,
-) -> SearchEquivalenceFuzzReport {
-    assert!(
-        !outputs.is_empty(),
-        "fuzz harness needs at least one output"
-    );
-
-    let native_reference_outputs = if config.reference == SearchEquivalenceReference::NativeRuntime
-    {
-        cx.build_search_space::<NativeRuntime>();
-        let mut native_rng = StdRng::seed_from_u64(config.seed);
-        let mut native_rt = cx.search_options(
-            NativeRuntime::default(),
-            SearchOptions::new(1),
-            &mut native_rng,
-        );
-        for input in inputs {
-            input.apply_native(&mut native_rt);
-        }
-        native_rt.execute(&cx.dyn_map);
-        Some(
-            outputs
-                .iter()
-                .map(|out| native_rt.get_f32(out.id).clone())
-                .collect::<Vec<_>>(),
-        )
-    } else {
-        None
-    };
-
-    cx.build_search_space_with_options::<CudaRuntime>(config.build_options);
-
-    let egraph = cx.egraph().expect("search space should be built");
-    let ops = cx.egglog_ops().expect("search ops should be built");
-    let seed = if native_reference_outputs.is_some() {
-        config.seed.wrapping_add(0xC0DA_C0DA)
-    } else {
-        config.seed
-    };
-    let mut rng = StdRng::seed_from_u64(seed);
-    let mut prev_selected = FxHashSet::default();
-    let mut base = random_initial_choice(egraph, &mut rng);
-    prev_selected.insert(hash_choice_set(&base));
-
-    let mut skipped_invalid = 0usize;
-    let reference_is_cuda = native_reference_outputs.is_none();
-    let (reference_hash, reference_outputs, mut tested) =
-        if let Some(reference_outputs) = native_reference_outputs {
-            (0, reference_outputs, 0usize)
-        } else {
-            let mut attempts = 0usize;
-            let (reference_hash, reference_outputs) = loop {
-                attempts += 1;
-                if attempts > config.max_attempts {
-                    panic!(
-                        "failed to extract a valid reference LLIR after {} attempts",
-                        config.max_attempts
-                    );
-                }
-                if validate_choice_set(egraph, &base, ops).is_err() {
-                    skipped_invalid += 1;
-                } else {
-                    let hash = hash_choice_set(&base);
-                    match run_choice_outputs(cx, stream, inputs, outputs, &base) {
-                        Ok(values) => break (hash, values),
-                        Err(err) => {
-                            skipped_invalid += 1;
-                            eprintln!("skipping invalid reference candidate hash={hash}: {err}");
-                        }
-                    }
-                }
-                base = random_initial_choice(egraph, &mut rng);
-                prev_selected.insert(hash_choice_set(&base));
-            };
-            (reference_hash, reference_outputs, 1usize)
-        };
-
-    let mut attempts = 0usize;
-    while tested < config.samples && attempts < config.max_attempts {
-        attempts += 1;
-        let mut candidates = extract_generation(
-            egraph,
-            &base,
-            config.generation_size,
-            config.mutations,
-            &mut prev_selected,
-            &mut rng,
-        );
-        if candidates.is_empty() {
-            let next = random_initial_choice(egraph, &mut rng);
-            prev_selected.insert(hash_choice_set(&next));
-            candidates.push(next);
-        }
-
-        for candidate in candidates {
-            if tested >= config.samples {
-                break;
-            }
-            let candidate_hash = hash_choice_set(&candidate);
-            if reference_is_cuda && candidate_hash == reference_hash {
-                continue;
-            }
-            if validate_choice_set(egraph, &candidate, ops).is_err() {
-                skipped_invalid += 1;
-                continue;
-            }
-
-            let candidate_outputs = run_choice_outputs(cx, stream, inputs, outputs, &candidate)
-                .unwrap_or_else(|err| panic!("candidate hash={candidate_hash} failed: {err}"));
-            assert_fuzz_outputs_close(
-                outputs,
-                &reference_outputs,
-                &candidate_outputs,
-                reference_hash,
-                candidate_hash,
-            );
-            base = candidate;
-            tested += 1;
-        }
-    }
-
-    assert_eq!(
-        tested, config.samples,
-        "only tested {tested}/{} LLIR samples before exhausting attempts",
-        config.samples
-    );
-    SearchEquivalenceFuzzReport {
-        tested,
-        skipped_invalid,
-    }
-}
-
-fn run_choice_outputs<'a>(
-    cx: &'a Graph,
-    stream: &Arc<cudarc::driver::CudaStream>,
-    inputs: &[CudaFuzzInput],
-    outputs: &[F32OutputCheck],
-    choices: &EGraphChoiceSet<'a>,
-) -> Result<Vec<Vec<f32>>, String> {
-    let egraph = cx.egraph().ok_or("search space was not built")?;
-    let ops = cx.egglog_ops().ok_or("search ops were not built")?;
-    let mut list_cache = FxHashMap::default();
-    let mut expr_cache = FxHashMap::default();
-    let mut llir_graph = egglog_to_llir(
-        egraph,
-        choices.clone(),
-        ops,
-        &cx.custom_ops,
-        &mut list_cache,
-        &mut expr_cache,
-        None,
-    );
-    unroll_loops_in_llir(&mut llir_graph);
-
-    let mut rt = CudaRuntime::initialize(stream.clone());
-    rt.load_llir(&llir_graph);
-    for input in inputs {
-        input.apply(&mut rt);
-    }
-    rt.execute(&cx.dyn_map);
-
-    Ok(outputs.iter().map(|out| rt.get_f32(out.id)).collect())
-}
-
-fn assert_fuzz_outputs_close(
-    outputs: &[F32OutputCheck],
-    expected: &[Vec<f32>],
-    actual: &[Vec<f32>],
-    reference_hash: u64,
-    candidate_hash: u64,
-) {
-    for ((spec, expected), actual) in outputs.iter().zip(expected.iter()).zip(actual.iter()) {
-        assert_eq!(
-            expected.len(),
-            actual.len(),
-            "output {} length mismatch for candidate hash={candidate_hash} reference hash={reference_hash}",
-            spec.name
-        );
-        let mut max_abs = 0.0f32;
-        let mut max_rel = 0.0f32;
-        let mut worst = 0usize;
-        for (i, (&a, &b)) in actual.iter().zip(expected.iter()).enumerate() {
-            assert!(
-                a.is_finite(),
-                "output {} candidate hash={candidate_hash} produced non-finite value {a} at index {i}",
-                spec.name
-            );
-            assert!(
-                b.is_finite(),
-                "output {} reference hash={reference_hash} produced non-finite value {b} at index {i}",
-                spec.name
-            );
-            let abs = (a - b).abs();
-            let rel = abs / b.abs().max(1e-12);
-            if abs > max_abs {
-                max_abs = abs;
-                max_rel = rel;
-                worst = i;
-            }
-            if abs > spec.atol + spec.rtol * b.abs() {
-                panic!(
-                    "output {} mismatch candidate hash={candidate_hash} reference hash={reference_hash} index={i} actual={a} expected={b} abs={abs} rel={rel} tolerance={}",
-                    spec.name,
-                    spec.atol + spec.rtol * b.abs()
-                );
-            }
-        }
-        eprintln!(
-            "fuzz output {} ok: candidate hash={candidate_hash} max_abs={max_abs} max_rel={max_rel} worst={worst}",
-            spec.name
-        );
-    }
-}
-
 /// Get the GPU compute capability as (major, minor).
 pub fn gpu_compute_cap() -> Option<(i32, i32)> {
    let ctx = CudaContext::new(0).ok()?;
@@ -530,15 +136,14 @@ pub fn gpu_compute_cap() -> Option<(i32, i32)> {

 /// Check if the current GPU supports the given dtype for tensor core / WMMA operations.
 pub fn gpu_supports_dtype(dtype: luminal::dtype::DType) -> bool {
-    let Some((major, minor)) = gpu_compute_cap() else {
+    let Some((major, _)) = gpu_compute_cap() else {
        return false;
    };
    match dtype {
        luminal::dtype::DType::Bf16 => major >= 8, // Ampere (sm_80+)
-        luminal::dtype::DType::F8E4M3 | luminal::dtype::DType::F8E5M2 => {
-            major > 8 || (major == 8 && minor >= 9)
-        } // Ada/Hopper (sm_89+)
-        luminal::dtype::DType::F4E2M1 | luminal::dtype::DType::F8UE8M0 => major >= 10, // Blackwell (sm_100+)
+        luminal::dtype::DType::F4E2M1
+        | luminal::dtype::DType::F8E4M3
+        | luminal::dtype::DType::F8UE8M0 => major >= 10, // Blackwell (sm_100+)
        _ => true,
    }
 }
--- a/crates/luminal_metal/Cargo.toml
+++ b/crates/luminal_metal/Cargo.toml
@@ -1,21 +1,18 @@
 [package]
 name = "luminal_metal"
 version = "0.2.0"
-edition = "2024"
+edition = "2021"
 description = "Metal backend for luminal"
 license = "MIT OR Apache-2.0"

 [dependencies]
 luminal = { path = "../.." }
-metal = { version = "0.31", features = ["mps"] }
+metal = "0.31"
 objc = "0.2"
 as-any = "0.3.2"
 itertools = "0.12.1"
-half = { version = "2.7.1", features = ["bytemuck"] }
+half = "2.7.1"
 tracing = "0.1.43"
-safetensors = "0.7.0"
-memmap2 = "0.9.9"
-bytemuck = "1.24.0"

 [dev-dependencies]
 candle-core = "0.9.2-alpha.1"
--- a/crates/luminal_metal/src/dyn_backend.rs
+++ b/crates/luminal_metal/src/dyn_backend.rs
@@ -1,7 +1,7 @@
 //! [`DynBackend`] implementation for the Metal runtime.

 use luminal::dtype::DType;
-use luminal::dyn_backend::{BackendCompileArgs, DynBackend, bytes_to_native_data, compile_backend};
+use luminal::dyn_backend::{bytes_to_native_data, compile_backend, BackendCompileArgs, DynBackend};
 use luminal::prelude::*;

 use crate::runtime::MetalRuntime;
--- a/crates/luminal_metal/src/kernel/matmul.rs
+++ b/crates/luminal_metal/src/kernel/matmul.rs
@@ -1,5 +1,227 @@
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum MPSMatrixLayout {
-    RowMajor,
-    TransposedRowMajor,
+use super::{MetalMulInfo, MetalSumReduceInfo};
+use luminal::prelude::*;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum MetalMatmulFamily {
+    #[default]
+    Naive,
+    RegularTiled,
+}
+
+#[derive(Debug, Clone)]
+pub struct MatmulDescriptor {
+    pub m: Expression,
+    pub n: Expression,
+    pub k: Expression,
+    pub batch_shape: Vec<Expression>,
+    pub lhs_strides: Vec<Expression>,
+    pub rhs_strides: Vec<Expression>,
+    pub out_strides: Vec<Expression>,
+    pub transpose_lhs: bool,
+    pub transpose_rhs: bool,
+}
+
+impl MatmulDescriptor {
+    pub fn from_mul_and_sum(
+        mul_info: &MetalMulInfo,
+        sum_info: &MetalSumReduceInfo,
+    ) -> Option<Self> {
+        let zero = Expression::from(0);
+        let z = Expression::from('z');
+
+        let is_simple_2d_matmul = mul_info.shape.len() == 3
+            && sum_info.shape.len() == 2
+            && mul_info.a_strides.len() == 3
+            && mul_info.b_strides.len() == 3
+            && sum_info.strides.len() == 2
+            && mul_info.shape[0] == sum_info.shape[0]
+            && mul_info.shape[1] == sum_info.shape[1]
+            && mul_info.shape[2] == sum_info.iters
+            && mul_info.a_strides[1] == zero
+            && mul_info.a_strides[2] == z
+            && mul_info.b_strides[0] == zero
+            && mul_info.b_strides[1] == z
+            && sum_info.strides[1] == z
+            && sum_info.iter_stride == z;
+
+        if !is_simple_2d_matmul {
+            return None;
+        }
+
+        Some(Self {
+            m: sum_info.shape[0],
+            n: sum_info.shape[1],
+            k: sum_info.iters,
+            batch_shape: Vec::new(),
+            lhs_strides: mul_info.a_strides.clone(),
+            rhs_strides: mul_info.b_strides.clone(),
+            out_strides: sum_info.strides.clone(),
+            transpose_lhs: false,
+            transpose_rhs: false,
+        })
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct MatmulPlan {
+    pub family: MetalMatmulFamily,
+    pub m: Expression,
+    pub n: Expression,
+    pub k: Expression,
+    pub lda: Expression,
+    pub ldb: Expression,
+    pub ldd: Expression,
+    pub batch_size: u32,
+    pub batch_stride_a: u32,
+    pub batch_stride_b: u32,
+    pub batch_stride_d: u32,
+    pub bm: u16,
+    pub bn: u16,
+    pub bk: u16,
+    pub wm: u16,
+    pub wn: u16,
+}
+
+#[derive(Debug, Default, Clone, Copy)]
+pub struct MetalMatmulPlanner;
+
+impl MetalMatmulPlanner {
+    pub fn plan(&self, desc: &MatmulDescriptor) -> MatmulPlan {
+        let family = if desc.batch_shape.is_empty()
+            && desc.m.as_num().is_some_and(|m| m >= 32)
+            && desc.n.as_num().is_some_and(|n| n >= 32)
+            && desc.k.as_num().is_some_and(|k| k >= 32)
+        {
+            MetalMatmulFamily::RegularTiled
+        } else {
+            MetalMatmulFamily::Naive
+        };
+        MatmulPlan {
+            family,
+            m: desc.m,
+            n: desc.n,
+            k: desc.k,
+            lda: desc.lhs_strides[0],
+            ldb: desc.rhs_strides[2],
+            ldd: desc.out_strides[0],
+            batch_size: 1,
+            batch_stride_a: 0,
+            batch_stride_b: 0,
+            batch_stride_d: 0,
+            bm: 16,
+            bn: 16,
+            bk: 8,
+            wm: 2,
+            wn: 2,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn descriptor_recovers_simple_2d_matmul() {
+        let mul = MetalMulInfo {
+            shape: vec![
+                Expression::from(4),
+                Expression::from(8),
+                Expression::from(16),
+            ],
+            a_strides: vec![
+                Expression::from('z') * 16,
+                Expression::from(0),
+                Expression::from('z'),
+            ],
+            b_strides: vec![
+                Expression::from(0),
+                Expression::from('z'),
+                Expression::from('z') * 8,
+            ],
+            output_strides: vec![
+                Expression::from('z') * 16,
+                Expression::from('z') * 8,
+                Expression::from('z'),
+            ],
+        };
+        let sum = MetalSumReduceInfo {
+            shape: vec![Expression::from(4), Expression::from(8)],
+            strides: vec![Expression::from('z') * 8, Expression::from('z')],
+            iters: Expression::from(16),
+            iter_stride: Expression::from('z'),
+        };
+
+        let desc = MatmulDescriptor::from_mul_and_sum(&mul, &sum).unwrap();
+        assert_eq!(desc.m, Expression::from(4));
+        assert_eq!(desc.n, Expression::from(8));
+        assert_eq!(desc.k, Expression::from(16));
+    }
+
+    #[test]
+    fn planner_keeps_small_problems_on_naive_path() {
+        let desc = MatmulDescriptor {
+            m: Expression::from(4),
+            n: Expression::from(8),
+            k: Expression::from(16),
+            batch_shape: Vec::new(),
+            lhs_strides: vec![
+                Expression::from('z') * 16,
+                Expression::from(0),
+                Expression::from('z'),
+            ],
+            rhs_strides: vec![
+                Expression::from(0),
+                Expression::from('z'),
+                Expression::from('z') * 8,
+            ],
+            out_strides: vec![Expression::from('z') * 8, Expression::from('z')],
+            transpose_lhs: false,
+            transpose_rhs: false,
+        };
+
+        let planner = MetalMatmulPlanner;
+        let plan = planner.plan(&desc);
+        assert_eq!(plan.family, MetalMatmulFamily::Naive);
+        assert_eq!(plan.bm, 16);
+        assert_eq!(plan.bn, 16);
+        assert_eq!(plan.bk, 8);
+        assert_eq!(plan.wm, 2);
+        assert_eq!(plan.wn, 2);
+        assert_eq!(plan.lda, Expression::from('z') * 16);
+        assert_eq!(plan.ldb, Expression::from('z') * 8);
+        assert_eq!(plan.ldd, Expression::from('z') * 8);
+    }
+
+    #[test]
+    fn planner_promotes_large_problems_to_regular_tiled() {
+        let desc = MatmulDescriptor {
+            m: Expression::from(64),
+            n: Expression::from(64),
+            k: Expression::from(64),
+            batch_shape: Vec::new(),
+            lhs_strides: vec![
+                Expression::from('z') * 64,
+                Expression::from(0),
+                Expression::from('z'),
+            ],
+            rhs_strides: vec![
+                Expression::from(0),
+                Expression::from('z'),
+                Expression::from('z') * 64,
+            ],
+            out_strides: vec![Expression::from('z') * 64, Expression::from('z')],
+            transpose_lhs: false,
+            transpose_rhs: false,
+        };
+
+        let planner = MetalMatmulPlanner;
+        let plan = planner.plan(&desc);
+        assert_eq!(plan.family, MetalMatmulFamily::RegularTiled);
+        assert_eq!(plan.bm, 16);
+        assert_eq!(plan.bn, 16);
+        assert_eq!(plan.bk, 8);
+        assert_eq!(plan.wm, 2);
+        assert_eq!(plan.wn, 2);
+    }
 }
--- a/crates/luminal_metal/src/kernel/mod.rs
+++ b/crates/luminal_metal/src/kernel/mod.rs
@@ -6,7 +6,7 @@ pub use ops::*;
 use luminal::dtype::DType;
 use luminal::op::EgglogOp;
 use luminal::prelude::*;
-use metal::{Buffer, CommandBufferRef, ComputeCommandEncoderRef, ComputePipelineState, Device};
+use metal::{Buffer, ComputeCommandEncoderRef, ComputePipelineState, Device};

 pub const DYN_SLOT_COUNT: usize = 26;

@@ -32,7 +32,7 @@ pub trait MetalKernelOp: EgglogOp {
        device: &Device,
        input_dtypes: &[DType],
        output_dtype: DType,
-    ) -> Option<ComputePipelineState>;
+    ) -> ComputePipelineState;

    fn infer_output_dtype(&self, input_dtypes: &[DType]) -> DType {
        input_dtypes.first().copied().unwrap_or(DType::F32)
@@ -40,7 +40,7 @@ pub trait MetalKernelOp: EgglogOp {

    fn output_size(&self) -> Expression;

-    fn encode_compute(
+    fn encode(
        &self,
        encoder: &ComputeCommandEncoderRef,
        pipeline: &ComputePipelineState,
@@ -49,26 +49,6 @@ pub trait MetalKernelOp: EgglogOp {
        dyn_map: &FxHashMap<char, usize>,
    );

-    #[allow(clippy::too_many_arguments)]
-    fn encode(
-        &self,
-        command_buffer: &CommandBufferRef,
-        pipeline: Option<&ComputePipelineState>,
-        inputs: &[&Buffer],
-        output: &Buffer,
-        dyn_map: &FxHashMap<char, usize>,
-        dyn_buffer: &Buffer,
-        _input_dtypes: &[DType],
-        _output_dtype: DType,
-    ) {
-        let pipeline = pipeline.expect("compute pipeline not compiled");
-        let encoder = command_buffer.new_compute_command_encoder();
-        let dyn_idx = inputs.len() as u64 + 1;
-        encoder.set_buffer(dyn_idx, Some(dyn_buffer), 0);
-        self.encode_compute(encoder, pipeline, inputs, output, dyn_map);
-        encoder.end_encoding();
-    }
-
    // ========================================================================
    // Performance Metrics for MBU/MFU Calculation
    // ========================================================================
@@ -93,10 +73,6 @@ pub trait MetalKernelOp: EgglogOp {
        None
    }

-    fn output_aliases_input(&self) -> Option<usize> {
-        None
-    }
-
    fn is_matmul(&self) -> bool {
        false
    }
--- a/crates/luminal_metal/src/kernel/ops.rs
+++ b/crates/luminal_metal/src/kernel/ops.rs
--- a/crates/luminal_metal/src/runtime.rs
+++ b/crates/luminal_metal/src/runtime.rs
@@ -1,31 +1,21 @@
-use crate::kernel::{DYN_SLOT_COUNT, MetalKernelOp};
-use half::{bf16, f16};
+use crate::kernel::{
+    MatmulDescriptor, MetalKernelOp, MetalMatmul, MetalMatmulPlanner, DYN_SLOT_COUNT,
+};
+use half::f16;
 use itertools::Itertools;
 use luminal::{
    dtype::DType,
-    graph::{BucketLLIR, DimBucket, Graph, LLIRGraph},
+    graph::LLIRGraph,
    hlir::{Input, NativeData, Output},
    op::{ExecutionStats, Runtime, RuntimeStats, TimingMethod},
    prelude::{
+        petgraph::{algo::toposort, prelude::StableGraph, visit::EdgeRef, Direction},
        FxHashMap, NodeIndex, ToId,
-        petgraph::{Direction, algo::toposort, prelude::StableGraph, visit::EdgeRef},
    },
 };
-use memmap2::MmapOptions;
 use metal::{Buffer, CommandQueue, ComputePipelineState, Device, MTLResourceOptions};
-use objc::rc::autoreleasepool;
 use objc::runtime::Object;
-use safetensors::{Dtype, SafeTensors};
-use std::{fs::File, time::Duration};
-
-#[derive(Clone)]
-struct MetalCompiledBucket {
-    bucket_indices: FxHashMap<char, usize>,
-    llir_graph: LLIRGraph,
-    node_dtypes: FxHashMap<NodeIndex, DType>,
-    pipelines: FxHashMap<NodeIndex, ComputePipelineState>,
-    output_alias_map: FxHashMap<NodeIndex, NodeIndex>,
-}
+use std::time::Duration;

 pub struct MetalRuntime {
    device: Device,
@@ -44,124 +34,83 @@ pub struct MetalRuntime {
    node_dtypes: FxHashMap<NodeIndex, DType>,
    /// Compiled pipeline states for each kernel node
    pipelines: FxHashMap<NodeIndex, ComputePipelineState>,
-    /// LLIR output node -> input node whose buffer contains the output.
-    output_alias_map: FxHashMap<NodeIndex, NodeIndex>,
-    /// Bucket definitions for dynamic dimensions.
-    dim_buckets: FxHashMap<char, Vec<DimBucket>>,
-    /// Compiled LLIR variants, one per bucket combination.
-    compiled_buckets: Vec<MetalCompiledBucket>,
-    /// Currently active compiled bucket.
-    active_bucket: usize,
 }

 impl MetalRuntime {
-    fn input_dtype(&self, id: NodeIndex) -> Option<DType> {
-        self.llir_graph.node_indices().find_map(|node| {
-            self.llir_graph[node]
-                .to_op::<Input>()
-                .and_then(|input| (input.node == id.index()).then_some(input.dtype))
-        })
-    }
+    fn fuse_matmuls(llir_graph: &LLIRGraph) -> LLIRGraph {
+        let mut graph = llir_graph.clone();
+        let planner = MetalMatmulPlanner;
+        let mut rewrites = Vec::new();

-    fn output_data_node(&self, id: NodeIndex) -> NodeIndex {
-        let output_id = self
-            .llir_graph
-            .node_indices()
-            .find(|n| {
-                if let Some(Output { node }) = self.llir_graph[*n].to_op::<Output>() {
-                    *node == id.index()
-                } else {
-                    false
-                }
-            })
-            .expect("Cannot find output tensor!");
+        for sum_node in graph.node_indices().collect::<Vec<_>>() {
+            let Some(sum_info) = graph[sum_node]
+                .to_dialect::<dyn MetalKernelOp>()
+                .and_then(|op| op.sum_reduce_info())
+            else {
+                continue;
+            };

-        self.llir_graph
-            .neighbors_directed(output_id, Direction::Incoming)
-            .next()
-            .unwrap()
-    }
+            let input_edges: Vec<_> = graph
+                .edges_directed(sum_node, Direction::Incoming)
+                .sorted_by_key(|e| e.id())
+                .map(|e| e.source())
+                .collect();
+            if input_edges.len() != 1 {
+                continue;
+            }

-    fn follow_aliases(&self, mut node: NodeIndex) -> NodeIndex {
-        while let Some(target) = self.output_alias_map.get(&node) {
-            node = *target;
+            let mul_node = input_edges[0];
+            let Some(mul_info) = graph[mul_node]
+                .to_dialect::<dyn MetalKernelOp>()
+                .and_then(|op| op.mul_info())
+            else {
+                continue;
+            };
+
+            let Some(desc) = MatmulDescriptor::from_mul_and_sum(&mul_info, &sum_info) else {
+                continue;
+            };
+
+            let mul_inputs: Vec<_> = graph
+                .edges_directed(mul_node, Direction::Incoming)
+                .sorted_by_key(|e| e.id())
+                .map(|e| e.source())
+                .collect();
+            if mul_inputs.len() != 2 {
+                continue;
+            }
+
+            rewrites.push((sum_node, mul_node, mul_inputs, planner.plan(&desc)));
        }
-        node
-    }

-    fn buffer_for_llir_node<'a>(
-        &'a self,
-        node: NodeIndex,
-        llir_to_hlir: &FxHashMap<NodeIndex, NodeIndex>,
-    ) -> &'a Buffer {
-        let data_node = self.follow_aliases(node);
-        if let Some(hlir_node) = llir_to_hlir.get(&data_node) {
-            self.hlir_buffers
-                .get(hlir_node)
-                .expect("Input buffer not set!")
-        } else {
-            self.buffers
-                .get(&data_node)
-                .expect("Intermediate buffer not found!")
+        for (sum_node, mul_node, mul_inputs, plan) in rewrites {
+            graph[sum_node] =
+                luminal::op::LLIROp::new::<dyn MetalKernelOp>(Box::new(MetalMatmul {
+                    m: plan.m,
+                    n: plan.n,
+                    k: plan.k,
+                    lda: plan.lda,
+                    ldb: plan.ldb,
+                    ldd: plan.ldd,
+                    family: plan.family,
+                    bm: plan.bm,
+                    bn: plan.bn,
+                    bk: plan.bk,
+                    wm: plan.wm,
+                    wn: plan.wn,
+                    batch_size: plan.batch_size,
+                    batch_stride_a: plan.batch_stride_a,
+                    batch_stride_b: plan.batch_stride_b,
+                    batch_stride_d: plan.batch_stride_d,
+                }));
+
+            graph.remove_node(mul_node);
+            graph.add_edge(mul_inputs[0], sum_node, ());
+            graph.add_edge(mul_inputs[1], sum_node, ());
        }
-    }

-    fn buffer_from_slice<T>(&self, values: &[T]) -> Buffer {
-        self.device.new_buffer_with_data(
-            values.as_ptr() as *const _,
-            std::mem::size_of_val(values) as u64,
-            MTLResourceOptions::StorageModeShared,
-        )
+        graph
    }
-
-    fn buffer_from_safetensor(
-        &self,
-        tensor: &safetensors::tensor::TensorView<'_>,
-        dtype: DType,
-    ) -> Buffer {
-        match (tensor.dtype(), dtype) {
-            (Dtype::F32, DType::F32) | (Dtype::F16, DType::F16) => {
-                let data = tensor.data();
-                self.device.new_buffer_with_data(
-                    data.as_ptr() as *const _,
-                    data.len() as u64,
-                    MTLResourceOptions::StorageModeShared,
-                )
-            }
-            (Dtype::F16, DType::F32) => {
-                let values: Vec<f32> = bytemuck::cast_slice::<u8, f16>(tensor.data())
-                    .iter()
-                    .map(|v| v.to_f32())
-                    .collect();
-                self.buffer_from_slice(&values)
-            }
-            (Dtype::BF16, DType::F32) => {
-                let values: Vec<f32> = bytemuck::cast_slice::<u8, bf16>(tensor.data())
-                    .iter()
-                    .map(|v| v.to_f32())
-                    .collect();
-                self.buffer_from_slice(&values)
-            }
-            (Dtype::F32, DType::F16) => {
-                let values: Vec<f16> = bytemuck::cast_slice::<u8, f32>(tensor.data())
-                    .iter()
-                    .map(|v| f16::from_f32(*v))
-                    .collect();
-                self.buffer_from_slice(&values)
-            }
-            (Dtype::BF16, DType::F16) => {
-                let values: Vec<f16> = bytemuck::cast_slice::<u8, bf16>(tensor.data())
-                    .iter()
-                    .map(|v| f16::from_f32(v.to_f32()))
-                    .collect();
-                self.buffer_from_slice(&values)
-            }
-            (tensor_dtype, dtype) => {
-                panic!("Cannot load safetensor dtype {tensor_dtype:?} into Metal dtype {dtype:?}")
-            }
-        }
-    }
-
    #[cfg(test)]
    pub(crate) fn contains_matmul(&self) -> bool {
        self.llir_graph.node_indices().any(|node| {
@@ -183,69 +132,29 @@ impl MetalRuntime {
            .collect()
    }

-    pub fn load_safetensors(&mut self, cx: &Graph, file_path: &str) {
-        let f = File::open(file_path).unwrap();
-        let mmap = unsafe { MmapOptions::new().map(&f).unwrap() };
-        let st = SafeTensors::deserialize(&mmap).unwrap();
-
-        for node in cx.graph.node_indices() {
-            if let Some(input) = (*cx.graph[node]).as_any().downcast_ref::<Input>()
-                && let Ok(tensor) = st.tensor(&input.label)
-            {
-                let buffer = self.buffer_from_safetensor(&tensor, input.dtype);
-                self.input_data.remove(&node);
-                self.hlir_buffers.insert(node, buffer);
-            }
-        }
-    }
-
    pub fn set_data(&mut self, id: impl ToId, data: impl Into<NativeData>) {
-        let id = id.to_id();
-        let data = data.into();
-        if let Some(dtype) = self.input_dtype(id) {
-            let buffer = self.create_input_buffer(&data, dtype);
-            self.hlir_buffers.insert(id, buffer);
-        }
-        self.input_data.insert(id, data);
-    }
-
-    pub fn set_zeros(&mut self, id: impl ToId, num_bytes: usize) {
-        let id = id.to_id();
-        let buffer = self
-            .device
-            .new_buffer(num_bytes as u64, MTLResourceOptions::StorageModeShared);
-        unsafe {
-            std::ptr::write_bytes(buffer.contents(), 0, num_bytes);
-        }
-        self.input_data.remove(&id);
-        self.hlir_buffers.insert(id, buffer);
-    }
-
-    pub fn remove_buffer(&mut self, id: impl ToId) -> Buffer {
-        let data_id = self.follow_aliases(self.output_data_node(id.to_id()));
-
-        if let Some(buffer) = self.buffers.remove(&data_id) {
-            return buffer;
-        }
-
-        if let Some(Input { node, .. }) = self.llir_graph[data_id].to_op::<Input>() {
-            return self
-                .hlir_buffers
-                .remove(&NodeIndex::new(*node))
-                .expect("Cannot find input tensor in runtime!");
-        }
-
-        panic!("Cannot find tensor in runtime!");
-    }
-
-    pub fn set_buffer(&mut self, id: impl ToId, buffer: Buffer) {
-        let id = id.to_id();
-        self.input_data.remove(&id);
-        self.hlir_buffers.insert(id, buffer);
+        self.input_data.insert(id.to_id(), data.into());
    }

    pub fn get_f32(&self, id: impl ToId) -> Vec<f32> {
-        let data_id = self.follow_aliases(self.output_data_node(id.to_id()));
+        let id = id.to_id();
+        let output_id = self
+            .llir_graph
+            .node_indices()
+            .find(|n| {
+                if let Some(Output { node }) = self.llir_graph[*n].to_op::<Output>() {
+                    *node == id.index()
+                } else {
+                    false
+                }
+            })
+            .expect("Cannot find output tensor!");
+
+        let data_id = self
+            .llir_graph
+            .neighbors_directed(output_id, Direction::Incoming)
+            .next()
+            .unwrap();

        let buffer = self
            .buffers
@@ -322,10 +231,6 @@ impl Runtime for MetalRuntime {
            llir_graph: StableGraph::default(),
            node_dtypes: FxHashMap::default(),
            pipelines: FxHashMap::default(),
-            output_alias_map: FxHashMap::default(),
-            dim_buckets: FxHashMap::default(),
-            compiled_buckets: vec![],
-            active_bucket: 0,
        }
    }

@@ -335,10 +240,50 @@ impl Runtime for MetalRuntime {

    #[tracing::instrument(skip_all)]
    fn load_llir(&mut self, llir_graph: &LLIRGraph) {
+        self.pipelines.clear();
        self.buffers.clear();
-        self.dim_buckets.clear();
-        self.compiled_buckets = vec![self.compile_bucket(FxHashMap::default(), llir_graph)];
-        self.activate_bucket(0);
+        self.hlir_buffers.clear();
+        self.node_dtypes.clear();
+        self.llir_graph = Self::fuse_matmuls(llir_graph);
+
+        let topo_order = toposort(&self.llir_graph, None).expect("Graph has cycles!");
+        for node in topo_order {
+            if let Some(input) = self.llir_graph[node].to_op::<Input>() {
+                self.node_dtypes.insert(node, input.dtype);
+                let hlir_id = NodeIndex::new(input.node);
+                if let Some(data) = self.input_data.get(&hlir_id) {
+                    let buffer = self.create_input_buffer(data, input.dtype);
+                    self.hlir_buffers.insert(hlir_id, buffer);
+                }
+                continue;
+            }
+
+            if self.llir_graph[node].to_op::<Output>().is_some() {
+                continue;
+            }
+
+            if let Some(kernel_op) = self.llir_graph[node].to_dialect::<dyn MetalKernelOp>() {
+                let input_nodes: Vec<NodeIndex> = self
+                    .llir_graph
+                    .edges_directed(node, Direction::Incoming)
+                    .sorted_by_key(|e| e.id())
+                    .map(|e| e.source())
+                    .collect();
+                let input_dtypes: Vec<DType> = input_nodes
+                    .iter()
+                    .map(|n| {
+                        self.node_dtypes
+                            .get(n)
+                            .copied()
+                            .unwrap_or_else(|| panic!("Missing inferred dtype for node {n:?}"))
+                    })
+                    .collect();
+                let output_dtype = kernel_op.infer_output_dtype(&input_dtypes);
+                let pipeline = kernel_op.compile(&self.device, &input_dtypes, output_dtype);
+                self.node_dtypes.insert(node, output_dtype);
+                self.pipelines.insert(node, pipeline);
+            }
+        }
    }

    #[tracing::instrument(skip_all)]
@@ -347,7 +292,6 @@ impl Runtime for MetalRuntime {
        llir_graph: &LLIRGraph,
        dyn_map: &FxHashMap<char, usize>,
        trials: usize,
-        _timeout: Option<std::time::Duration>,
    ) -> (Self::ProfileMetric, String) {
        self.load_llir(llir_graph);
        self.allocate_intermediate_buffers(dyn_map);
@@ -366,105 +310,73 @@ impl Runtime for MetalRuntime {

    #[tracing::instrument(skip_all)]
    fn execute(&mut self, dyn_map: &FxHashMap<char, usize>) -> Self::ExecReturn {
-        autoreleasepool(|| {
-            self.select_bucket(dyn_map);
-            self.allocate_active_intermediate_buffers(dyn_map);
-
-            let llir_to_hlir: FxHashMap<NodeIndex, NodeIndex> = self
-                .llir_graph
-                .node_indices()
-                .filter_map(|n| {
-                    if let Some(Input { node, .. }) = self.llir_graph[n].to_op::<Input>() {
-                        Some((n, NodeIndex::new(*node)))
-                    } else {
-                        None
-                    }
-                })
-                .collect();
-
-            let topo_order = toposort(&self.llir_graph, None).expect("Graph has cycles!");
-
-            self.update_dyn_buffer(dyn_map);
-            let command_buffer = self.command_queue.new_command_buffer();
-
-            for node in topo_order {
-                if self.llir_graph[node].to_op::<Input>().is_some()
-                    || self.llir_graph[node].to_op::<Output>().is_some()
-                {
-                    continue;
+        let llir_to_hlir: FxHashMap<NodeIndex, NodeIndex> = self
+            .llir_graph
+            .node_indices()
+            .filter_map(|n| {
+                if let Some(Input { node, .. }) = self.llir_graph[n].to_op::<Input>() {
+                    Some((n, NodeIndex::new(*node)))
+                } else {
+                    None
                }
+            })
+            .collect();

-                if let Some(kernel_op) = self.llir_graph[node].to_dialect::<dyn MetalKernelOp>() {
-                    let pipeline = self.pipelines.get(&node);
+        let topo_order = toposort(&self.llir_graph, None).expect("Graph has cycles!");

-                    let input_nodes: Vec<NodeIndex> = self
-                        .llir_graph
-                        .edges_directed(node, Direction::Incoming)
-                        .sorted_by_key(|e| e.id())
-                        .map(|e| e.source())
-                        .collect();
+        self.update_dyn_buffer(dyn_map);
+        let command_buffer = self.command_queue.new_command_buffer();
+        let encoder = command_buffer.new_compute_command_encoder();

-                    let input_buffers: Vec<&Buffer> = input_nodes
-                        .iter()
-                        .map(|&n| self.buffer_for_llir_node(n, &llir_to_hlir))
-                        .collect();
-                    let input_dtypes: Vec<DType> = input_nodes
-                        .iter()
-                        .map(|n| {
-                            self.node_dtypes
-                                .get(n)
-                                .copied()
-                                .unwrap_or_else(|| panic!("Missing inferred dtype for node {n:?}"))
-                        })
-                        .collect();
-
-                    let output_buffer = if let Some(alias_idx) = kernel_op.output_aliases_input() {
-                        input_buffers[alias_idx]
-                    } else {
-                        self.buffers
-                            .get(&node)
-                            .expect("Output buffer not allocated!")
-                    };
-                    let output_dtype = self.node_dtypes.get(&node).copied().unwrap_or(DType::F32);
-
-                    kernel_op.encode(
-                        command_buffer,
-                        pipeline,
-                        &input_buffers,
-                        output_buffer,
-                        dyn_map,
-                        &self.dyn_buffer,
-                        &input_dtypes,
-                        output_dtype,
-                    );
-                }
+        for node in topo_order {
+            if self.llir_graph[node].to_op::<Input>().is_some()
+                || self.llir_graph[node].to_op::<Output>().is_some()
+            {
+                continue;
            }

-            command_buffer.commit();
-            command_buffer.wait_until_completed();
-        });
-    }
+            if let Some(kernel_op) = self.llir_graph[node].to_dialect::<dyn MetalKernelOp>() {
+                let pipeline = self.pipelines.get(&node).expect("Pipeline not compiled!");

-    fn clear_intermediate_buffers(&mut self) {
-        self.buffers.clear();
-    }
+                let input_nodes: Vec<NodeIndex> = self
+                    .llir_graph
+                    .edges_directed(node, Direction::Incoming)
+                    .sorted_by_key(|e| e.id())
+                    .map(|e| e.source())
+                    .collect();

-    fn load_llir_buckets(
-        &mut self,
-        dim_buckets: &FxHashMap<char, Vec<DimBucket>>,
-        bucket_llirs: &[BucketLLIR],
-    ) {
-        self.buffers.clear();
-        self.dim_buckets = dim_buckets.clone();
-        self.compiled_buckets = bucket_llirs
-            .iter()
-            .map(|(bucket_indices, _, llir)| self.compile_bucket(bucket_indices.clone(), llir))
-            .collect();
-        assert!(
-            !self.compiled_buckets.is_empty(),
-            "Metal runtime received no bucketed LLIRs"
-        );
-        self.activate_bucket(0);
+                let input_buffers: Vec<&Buffer> = input_nodes
+                    .iter()
+                    .map(|&n| {
+                        if let Some(hlir_node) = llir_to_hlir.get(&n) {
+                            self.hlir_buffers
+                                .get(hlir_node)
+                                .expect("Input buffer not set!")
+                        } else {
+                            self.buffers
+                                .get(&n)
+                                .expect("Intermediate buffer not found!")
+                        }
+                    })
+                    .collect();
+
+                let output_buffer = self
+                    .buffers
+                    .get(&node)
+                    .expect("Output buffer not allocated!");
+
+                // Bind dyn dims right after the output slot:
+                // [inputs..., output, dyn, bytes...]
+                let dyn_idx = input_buffers.len() as u64 + 1;
+                encoder.set_buffer(dyn_idx, Some(&self.dyn_buffer), 0);
+
+                kernel_op.encode(encoder, pipeline, &input_buffers, output_buffer, dyn_map);
+            }
+        }
+
+        encoder.end_encoding();
+        command_buffer.commit();
+        command_buffer.wait_until_completed();
    }
 }

@@ -525,164 +437,23 @@ impl MetalRuntime {
    }

    pub fn allocate_intermediate_buffers(&mut self, dyn_map: &FxHashMap<char, usize>) {
-        self.select_bucket(dyn_map);
-        self.allocate_active_intermediate_buffers(dyn_map);
-    }
-
-    fn allocate_active_intermediate_buffers(&mut self, dyn_map: &FxHashMap<char, usize>) {
-        let mut planned = Vec::new();
-
        for node in self.llir_graph.node_indices() {
            if self.llir_graph[node].to_op::<Input>().is_some() {
                continue;
            }

            if let Some(kernel_op) = self.llir_graph[node].to_dialect::<dyn MetalKernelOp>() {
-                if kernel_op.output_aliases_input().is_some() {
-                    continue;
-                }
                let size = kernel_op.output_size().exec(dyn_map).unwrap();
                let dtype = self.node_dtypes.get(&node).copied().unwrap_or(DType::F32);
-                let bytes = (size * dtype.bits().div_ceil(8)) as u64;
-                let needs_buffer = self
-                    .buffers
-                    .get(&node)
-                    .is_none_or(|buffer| buffer.length() != bytes);
-
-                planned.push((node, bytes, needs_buffer));
-            }
-        }
-
-        for (node, bytes, needs_buffer) in planned {
-            if needs_buffer {
-                let buffer = self
-                    .device
-                    .new_buffer(bytes, MTLResourceOptions::StorageModeShared);
+                let buffer = self.device.new_buffer(
+                    (size * dtype.bits().div_ceil(8)) as u64,
+                    MTLResourceOptions::StorageModeShared,
+                );
                self.buffers.insert(node, buffer);
            }
        }
    }

-    fn compile_bucket(
-        &self,
-        bucket_indices: FxHashMap<char, usize>,
-        llir_graph: &LLIRGraph,
-    ) -> MetalCompiledBucket {
-        let mut node_dtypes = FxHashMap::default();
-        let mut pipelines = FxHashMap::default();
-        let mut output_alias_map = FxHashMap::default();
-        let llir_graph = llir_graph.clone();
-
-        let topo_order = toposort(&llir_graph, None).expect("Graph has cycles!");
-        for node in topo_order {
-            if let Some(input) = llir_graph[node].to_op::<Input>() {
-                node_dtypes.insert(node, input.dtype);
-                continue;
-            }
-
-            if llir_graph[node].to_op::<Output>().is_some() {
-                continue;
-            }
-
-            if let Some(kernel_op) = llir_graph[node].to_dialect::<dyn MetalKernelOp>() {
-                let input_nodes: Vec<NodeIndex> = llir_graph
-                    .edges_directed(node, Direction::Incoming)
-                    .sorted_by_key(|e| e.id())
-                    .map(|e| e.source())
-                    .collect();
-                let input_dtypes: Vec<DType> = input_nodes
-                    .iter()
-                    .map(|n| {
-                        node_dtypes
-                            .get(n)
-                            .copied()
-                            .unwrap_or_else(|| panic!("Missing inferred dtype for node {n:?}"))
-                    })
-                    .collect();
-                let output_dtype = kernel_op.infer_output_dtype(&input_dtypes);
-                let pipeline = kernel_op.compile(&self.device, &input_dtypes, output_dtype);
-                node_dtypes.insert(node, output_dtype);
-                if let Some(pipeline) = pipeline {
-                    pipelines.insert(node, pipeline);
-                }
-                if let Some(input_idx) = kernel_op.output_aliases_input()
-                    && let Some(target) = input_nodes.get(input_idx).copied()
-                {
-                    output_alias_map.insert(node, target);
-                }
-            } else {
-                panic!("Metal runtime cannot execute unlowered LLIR node {node:?}");
-            }
-        }
-
-        MetalCompiledBucket {
-            bucket_indices,
-            llir_graph,
-            node_dtypes,
-            pipelines,
-            output_alias_map,
-        }
-    }
-
-    fn activate_bucket(&mut self, index: usize) {
-        let bucket = self
-            .compiled_buckets
-            .get(index)
-            .unwrap_or_else(|| panic!("Metal bucket index {index} is not compiled"))
-            .clone();
-        self.active_bucket = index;
-        self.llir_graph = bucket.llir_graph;
-        self.node_dtypes = bucket.node_dtypes;
-        self.pipelines = bucket.pipelines;
-        self.output_alias_map = bucket.output_alias_map;
-        self.refresh_input_data_buffers();
-        self.buffers.clear();
-    }
-
-    fn refresh_input_data_buffers(&mut self) {
-        for node in self.llir_graph.node_indices() {
-            if let Some(input) = self.llir_graph[node].to_op::<Input>() {
-                let hlir_id = NodeIndex::new(input.node);
-                if let Some(data) = self.input_data.get(&hlir_id) {
-                    let buffer = self.create_input_buffer(data, input.dtype);
-                    self.hlir_buffers.insert(hlir_id, buffer);
-                }
-            }
-        }
-    }
-
-    fn select_bucket(&mut self, dyn_map: &FxHashMap<char, usize>) {
-        if self.compiled_buckets.len() <= 1 {
-            return;
-        }
-
-        let index = self.resolve_bucket(dyn_map);
-        if index != self.active_bucket {
-            self.activate_bucket(index);
-        }
-    }
-
-    fn resolve_bucket(&self, dyn_map: &FxHashMap<char, usize>) -> usize {
-        self.compiled_buckets
-            .iter()
-            .position(|bucket| {
-                self.dim_buckets.iter().all(|(dim, buckets)| {
-                    let value = dyn_map.get(dim).copied().unwrap_or(0);
-                    let bucket_index = bucket.bucket_indices.get(dim).copied().unwrap_or(0);
-                    buckets
-                        .get(bucket_index)
-                        .map(|bucket| bucket.contains(value))
-                        .unwrap_or(true)
-                })
-            })
-            .unwrap_or_else(|| {
-                panic!(
-                    "No Metal bucket matches dyn_map {:?}. Defined buckets: {:?}",
-                    dyn_map, self.dim_buckets
-                )
-            })
-    }
-
    fn update_dyn_buffer(&mut self, dyn_map: &FxHashMap<char, usize>) {
        let ptr = self.dyn_buffer.contents() as *mut i32;
        unsafe {
@@ -702,99 +473,87 @@ impl MetalRuntime {

    /// Execute and return GPU-side execution time in microseconds.
    fn execute_timed(&mut self, dyn_map: &FxHashMap<char, usize>) -> (f64, TimingMethod) {
-        autoreleasepool(|| {
-            self.select_bucket(dyn_map);
-            self.allocate_active_intermediate_buffers(dyn_map);
-
-            let llir_to_hlir: FxHashMap<NodeIndex, NodeIndex> = self
-                .llir_graph
-                .node_indices()
-                .filter_map(|n| {
-                    if let Some(Input { node, .. }) = self.llir_graph[n].to_op::<Input>() {
-                        Some((n, NodeIndex::new(*node)))
-                    } else {
-                        None
-                    }
-                })
-                .collect();
-
-            let topo_order = toposort(&self.llir_graph, None).expect("Graph has cycles!");
-
-            self.update_dyn_buffer(dyn_map);
-            let command_buffer = self.command_queue.new_command_buffer();
-
-            for node in topo_order {
-                if self.llir_graph[node].to_op::<Input>().is_some()
-                    || self.llir_graph[node].to_op::<Output>().is_some()
-                {
-                    continue;
+        let llir_to_hlir: FxHashMap<NodeIndex, NodeIndex> = self
+            .llir_graph
+            .node_indices()
+            .filter_map(|n| {
+                if let Some(Input { node, .. }) = self.llir_graph[n].to_op::<Input>() {
+                    Some((n, NodeIndex::new(*node)))
+                } else {
+                    None
                }
+            })
+            .collect();

-                if let Some(kernel_op) = self.llir_graph[node].to_dialect::<dyn MetalKernelOp>() {
-                    let pipeline = self.pipelines.get(&node);
+        let topo_order = toposort(&self.llir_graph, None).expect("Graph has cycles!");

-                    let input_nodes: Vec<NodeIndex> = self
-                        .llir_graph
-                        .edges_directed(node, Direction::Incoming)
-                        .sorted_by_key(|e| e.id())
-                        .map(|e| e.source())
-                        .collect();
+        self.update_dyn_buffer(dyn_map);
+        let command_buffer = self.command_queue.new_command_buffer();
+        let encoder = command_buffer.new_compute_command_encoder();

-                    let input_buffers: Vec<&Buffer> = input_nodes
-                        .iter()
-                        .map(|&n| self.buffer_for_llir_node(n, &llir_to_hlir))
-                        .collect();
-                    let input_dtypes: Vec<DType> = input_nodes
-                        .iter()
-                        .map(|n| {
-                            self.node_dtypes
-                                .get(n)
-                                .copied()
-                                .unwrap_or_else(|| panic!("Missing inferred dtype for node {n:?}"))
-                        })
-                        .collect();
-
-                    let output_buffer = if let Some(alias_idx) = kernel_op.output_aliases_input() {
-                        input_buffers[alias_idx]
-                    } else {
-                        self.buffers
-                            .get(&node)
-                            .expect("Output buffer not allocated!")
-                    };
-                    let output_dtype = self.node_dtypes.get(&node).copied().unwrap_or(DType::F32);
-
-                    kernel_op.encode(
-                        command_buffer,
-                        pipeline,
-                        &input_buffers,
-                        output_buffer,
-                        dyn_map,
-                        &self.dyn_buffer,
-                        &input_dtypes,
-                        output_dtype,
-                    );
-                }
+        for node in topo_order {
+            if self.llir_graph[node].to_op::<Input>().is_some()
+                || self.llir_graph[node].to_op::<Output>().is_some()
+            {
+                continue;
            }

-            command_buffer.commit();
-            command_buffer.wait_until_completed();
+            if let Some(kernel_op) = self.llir_graph[node].to_dialect::<dyn MetalKernelOp>() {
+                let pipeline = self.pipelines.get(&node).expect("Pipeline not compiled!");

-            // gpuStartTime and gpuEndTime are available on macOS 10.15+
-            let gpu_start: f64 = unsafe {
-                use objc::{msg_send, sel, sel_impl};
-                let ptr = command_buffer as *const _ as *mut Object;
-                msg_send![ptr, GPUStartTime]
-            };
-            let gpu_end: f64 = unsafe {
-                use objc::{msg_send, sel, sel_impl};
-                let ptr = command_buffer as *const _ as *mut Object;
-                msg_send![ptr, GPUEndTime]
-            };
+                let input_nodes: Vec<NodeIndex> = self
+                    .llir_graph
+                    .edges_directed(node, Direction::Incoming)
+                    .sorted_by_key(|e| e.id())
+                    .map(|e| e.source())
+                    .collect();

-            let gpu_time_seconds = gpu_end - gpu_start;
-            let gpu_time_us = gpu_time_seconds * 1_000_000.0;
+                let input_buffers: Vec<&Buffer> = input_nodes
+                    .iter()
+                    .map(|&n| {
+                        if let Some(hlir_node) = llir_to_hlir.get(&n) {
+                            self.hlir_buffers
+                                .get(hlir_node)
+                                .expect("Input buffer not set!")
+                        } else {
+                            self.buffers
+                                .get(&n)
+                                .expect("Intermediate buffer not found!")
+                        }
+                    })
+                    .collect();

-            (gpu_time_us, TimingMethod::DeviceTimestamp)
-        })
+                let output_buffer = self
+                    .buffers
+                    .get(&node)
+                    .expect("Output buffer not allocated!");
+
+                let dyn_idx = input_buffers.len() as u64 + 1;
+                encoder.set_buffer(dyn_idx, Some(&self.dyn_buffer), 0);
+
+                kernel_op.encode(encoder, pipeline, &input_buffers, output_buffer, dyn_map);
+            }
+        }
+
+        encoder.end_encoding();
+        command_buffer.commit();
+        command_buffer.wait_until_completed();
+
+        // gpuStartTime and gpuEndTime are available on macOS 10.15+
+        let gpu_start: f64 = unsafe {
+            use objc::{msg_send, sel, sel_impl};
+            let ptr = command_buffer as *const _ as *mut Object;
+            msg_send![ptr, GPUStartTime]
+        };
+        let gpu_end: f64 = unsafe {
+            use objc::{msg_send, sel, sel_impl};
+            let ptr = command_buffer as *const _ as *mut Object;
+            msg_send![ptr, GPUEndTime]
+        };
+
+        let gpu_time_seconds = gpu_end - gpu_start;
+        let gpu_time_us = gpu_time_seconds * 1_000_000.0;
+
+        (gpu_time_us, TimingMethod::DeviceTimestamp)
    }
 }
--- a/crates/luminal_metal/src/tests.rs
+++ b/crates/luminal_metal/src/tests.rs
@@ -1,16 +1,8 @@
 use crate::{kernel::lower_expression_for_metal, runtime::MetalRuntime};
 use candle_core::{Device as CandleDevice, Tensor as CandleTensor};
-use half::{bf16, f16};
+use half::f16;
 use luminal::prelude::*;
 use proptest::prelude::*;
-use safetensors::{Dtype, tensor::TensorView};
-use std::{
-    collections::HashMap,
-    path::PathBuf,
-    sync::atomic::{AtomicUsize, Ordering},
-};
-
-static SAFETENSORS_TEST_FILE_ID: AtomicUsize = AtomicUsize::new(0);

 fn assert_close(actual: &[f32], expected: &[f32], tolerance: f32) {
    assert_eq!(
@@ -34,32 +26,6 @@ fn assert_close(actual: &[f32], expected: &[f32], tolerance: f32) {
    }
 }

-fn bytes_of<T: bytemuck::NoUninit>(values: &[T]) -> Vec<u8> {
-    bytemuck::cast_slice(values).to_vec()
-}
-
-fn write_test_safetensors(tensors: &[(&str, Dtype, Vec<usize>, Vec<u8>)]) -> PathBuf {
-    let tensor_views: HashMap<String, TensorView<'_>> = tensors
-        .iter()
-        .map(|(name, dtype, shape, data)| {
-            (
-                (*name).to_string(),
-                TensorView::new(*dtype, shape.clone(), data).unwrap(),
-            )
-        })
-        .collect();
-    let serialized = safetensors::serialize(&tensor_views, None).unwrap();
-    let id = SAFETENSORS_TEST_FILE_ID.fetch_add(1, Ordering::Relaxed);
-    let mut path = std::env::temp_dir();
-    path.push(format!(
-        "luminal_metal_runtime_{}_{}.safetensors",
-        std::process::id(),
-        id
-    ));
-    std::fs::write(&path, serialized).unwrap();
-    path
-}
-
 const TRANSFORMER_SEQ: usize = 4;
 const TRANSFORMER_HIDDEN: usize = 16;
 const TRANSFORMER_INTERMEDIATE: usize = 32;
@@ -284,53 +250,6 @@ fn dynamic_dim_sum_reduce_runs() {
    assert_close(&out, &[9.0, 12.0], 0.001);
 }

-#[test]
-fn metal_bucketed_dynamic_dim_dispatches_correct_graph() {
-    let mut cx = Graph::default();
-    let input = cx.tensor(('s', 4));
-    let output = (input + input).output();
-
-    cx.set_dim_buckets('s', &[DimBucket::new(1, 1), DimBucket::new(2, 4)]);
-    cx.set_dim('s', 1);
-    cx.build_search_space::<MetalRuntime>();
-
-    let mut rt = MetalRuntime::initialize(());
-    rt.set_data(input, vec![1.0f32; 4]);
-    rt = cx.search(rt, 5);
-
-    cx.set_dim('s', 1);
-    let s1_input = vec![1.0, 2.0, 3.0, 4.0];
-    rt.set_data(input, s1_input.clone());
-    rt.execute(&cx.dyn_map);
-    let s1_out = rt.get_f32(output);
-    assert_close(&s1_out[..4], &[2.0, 4.0, 6.0, 8.0], 0.001);
-
-    cx.set_dim('s', 3);
-    let s3_input: Vec<f32> = (0..12).map(|i| i as f32).collect();
-    let s3_expected: Vec<f32> = s3_input.iter().map(|v| v * 2.0).collect();
-    rt.set_data(input, s3_input);
-    rt.execute(&cx.dyn_map);
-    let s3_out = rt.get_f32(output);
-    assert_close(&s3_out[..12], &s3_expected, 0.001);
-}
-
-#[test]
-fn metal_int_arithmetic_preserves_large_values() {
-    let mut cx = Graph::default();
-    let token = cx.tensor(1).as_dtype(DType::Int);
-    let large_index = (token * 1024) + 123;
-    let mod_output = (large_index % 65_537).output();
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-    rt.set_data(token, &[16_385i32]);
-    rt = cx.search(rt, 1);
-    rt.allocate_intermediate_buffers(&cx.dyn_map);
-    rt.execute(&cx.dyn_map);
-
-    assert_eq!(rt.get_f32(mod_output), vec![891.0]);
-}
-
 proptest! {
    #![proptest_config(ProptestConfig::with_cases(5))]

@@ -709,13 +628,8 @@ fn metal_regular_tiled_matmul_path() {

    let kernels = rt.debug_kernel_ops();
    assert!(
-        kernels.iter().any(|k| k.contains("MPSMatmul")),
-        "expected MPS matmul path, kernels: {:?}",
-        kernels
-    );
-    assert!(
-        !kernels.iter().any(|k| k.contains("GenericMatmul")),
-        "MPS-compatible matmul should not extract the generic fallback, kernels: {:?}",
+        kernels.iter().any(|k| k.contains("family: RegularTiled")),
+        "expected regular tiled matmul path, kernels: {:?}",
        kernels
    );

@@ -733,287 +647,6 @@ fn metal_regular_tiled_matmul_path() {
    assert_close(&result, &expected, 2e-3);
 }

-#[test]
-fn metal_mps_matmul_transposed_rhs_weight_layout() {
-    let mut cx = Graph::default();
-    let m = 7;
-    let k = 11;
-    let n = 13;
-    let a = cx.tensor((m, k));
-    let weight = cx.tensor((n, k));
-    let output = a.matmul(weight.t()).output();
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-
-    let a_data = seeded_data(m * k, 0.35, -0.17);
-    let weight_data = seeded_data(n * k, 0.21, -0.09);
-
-    rt.set_data(a, &a_data);
-    rt.set_data(weight, &weight_data);
-    rt = cx.search(rt, 1);
-
-    let kernels = rt.debug_kernel_ops();
-    assert!(
-        kernels.iter().any(|k| k.contains("transpose_rhs: true")),
-        "expected MPS matmul to cover transposed row-major RHS, kernels: {:?}",
-        kernels
-    );
-
-    rt.allocate_intermediate_buffers(&cx.dyn_map);
-    rt.execute(&cx.dyn_map);
-
-    let result = rt.get_f32(output);
-
-    let device = CandleDevice::Cpu;
-    let ref_a = CandleTensor::from_vec(a_data, (m, k), &device).unwrap();
-    let ref_weight = CandleTensor::from_vec(weight_data, (n, k), &device).unwrap();
-    let expected = ref_a.matmul(&ref_weight.t().unwrap()).unwrap();
-    let expected: Vec<f32> = expected.flatten_all().unwrap().to_vec1().unwrap();
-
-    assert_close(&result, &expected, 1e-3);
-}
-
-#[test]
-fn metal_mps_matmul_transposed_lhs_layout() {
-    let mut cx = Graph::default();
-    let m = 5;
-    let k = 9;
-    let n = 6;
-    let lhs_storage = cx.tensor((k, m));
-    let rhs = cx.tensor((k, n));
-    let output = lhs_storage.t().matmul(rhs).output();
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-
-    let lhs_data = seeded_data(k * m, 0.31, -0.12);
-    let rhs_data = seeded_data(k * n, 0.27, -0.08);
-
-    rt.set_data(lhs_storage, &lhs_data);
-    rt.set_data(rhs, &rhs_data);
-    rt = cx.search(rt, 1);
-
-    let kernels = rt.debug_kernel_ops();
-    assert!(
-        kernels.iter().any(|k| k.contains("transpose_lhs: true")),
-        "expected MPS matmul to cover transposed row-major LHS, kernels: {:?}",
-        kernels
-    );
-
-    rt.allocate_intermediate_buffers(&cx.dyn_map);
-    rt.execute(&cx.dyn_map);
-
-    let result = rt.get_f32(output);
-
-    let device = CandleDevice::Cpu;
-    let ref_lhs = CandleTensor::from_vec(lhs_data, (k, m), &device)
-        .unwrap()
-        .t()
-        .unwrap();
-    let ref_rhs = CandleTensor::from_vec(rhs_data, (k, n), &device).unwrap();
-    let expected = ref_lhs.matmul(&ref_rhs).unwrap();
-    let expected: Vec<f32> = expected.flatten_all().unwrap().to_vec1().unwrap();
-
-    assert_close(&result, &expected, 1e-3);
-}
-
-#[test]
-fn metal_mps_batched_matmul_row_row_layout() {
-    let mut cx = Graph::default();
-    let batch = 3;
-    let m = 4;
-    let k = 5;
-    let n = 6;
-    let a = cx.tensor((batch, m, k));
-    let b = cx.tensor((batch, k, n));
-    let output = a.matmul(b).output();
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-
-    let a_data = seeded_data(batch * m * k, 0.17, -0.08);
-    let b_data = seeded_data(batch * k * n, 0.11, -0.05);
-    rt.set_data(a, &a_data);
-    rt.set_data(b, &b_data);
-    rt = cx.search(rt, 1);
-
-    let kernels = rt.debug_kernel_ops();
-    assert!(
-        kernels.iter().any(|k| k.contains("MPSBatchedMatmul")),
-        "expected MPS batched matmul path, kernels: {:?}",
-        kernels
-    );
-
-    rt.allocate_intermediate_buffers(&cx.dyn_map);
-    rt.execute(&cx.dyn_map);
-    let result = rt.get_f32(output);
-
-    let mut expected = vec![0.0; batch * m * n];
-    for batch_idx in 0..batch {
-        for row in 0..m {
-            for col in 0..n {
-                let mut sum = 0.0;
-                for inner in 0..k {
-                    sum += a_data[batch_idx * m * k + row * k + inner]
-                        * b_data[batch_idx * k * n + inner * n + col];
-                }
-                expected[batch_idx * m * n + row * n + col] = sum;
-            }
-        }
-    }
-
-    assert_close(&result, &expected, 1e-3);
-}
-
-#[test]
-fn metal_generic_matmul_covers_noncontiguous_merged_head_projection() {
-    let mut cx = Graph::default();
-    let heads = 3;
-    let seq = 4;
-    let head_dim = 5;
-    let hidden = heads * head_dim;
-    let out_dim = 7;
-    let attn = cx.tensor((heads, seq, head_dim));
-    let weight = cx.tensor((out_dim, hidden));
-    let merged = attn.transpose(0, 1).merge_dims(1, 2);
-    let output = merged.matmul(weight.t()).output();
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-
-    let attn_data = seeded_data(heads * seq * head_dim, 0.19, -0.09);
-    let weight_data = seeded_data(out_dim * hidden, 0.14, -0.06);
-    rt.set_data(attn, &attn_data);
-    rt.set_data(weight, &weight_data);
-    rt = cx.search(rt, 1);
-
-    let kernels = rt.debug_kernel_ops();
-    assert!(
-        kernels.iter().any(|k| k.contains("GenericMatmul")),
-        "expected generic matmul fallback for non-contiguous merged-head projection, kernels: {:?}",
-        kernels
-    );
-    assert!(
-        !kernels.iter().any(|k| {
-            k.contains("MetalMul") && k.contains(&format!("shape: [{seq}, {out_dim}, {hidden}]"))
-        }),
-        "generic fallback should remove the broadcast multiply intermediate, kernels: {:?}",
-        kernels
-    );
-
-    rt.allocate_intermediate_buffers(&cx.dyn_map);
-    rt.execute(&cx.dyn_map);
-    let result = rt.get_f32(output);
-
-    let mut expected = vec![0.0; seq * out_dim];
-    for token in 0..seq {
-        for out_col in 0..out_dim {
-            let mut sum = 0.0;
-            for inner in 0..hidden {
-                let head = inner / head_dim;
-                let dim = inner % head_dim;
-                let attn_idx = head * seq * head_dim + token * head_dim + dim;
-                sum += attn_data[attn_idx] * weight_data[out_col * hidden + inner];
-            }
-            expected[token * out_dim + out_col] = sum;
-        }
-    }
-
-    assert_close(&result, &expected, 1e-3);
-}
-
-#[test]
-fn metal_mps_batched_matmul_transposed_rhs_layout() {
-    let mut cx = Graph::default();
-    let batch = 4;
-    let m = 3;
-    let k = 7;
-    let n = 5;
-    let a = cx.tensor((batch, m, k));
-    let weight = cx.tensor((batch, n, k));
-    let output = a.matmul(weight.permute((0, 2, 1))).output();
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-
-    let a_data = seeded_data(batch * m * k, 0.13, -0.06);
-    let weight_data = seeded_data(batch * n * k, 0.09, -0.04);
-    rt.set_data(a, &a_data);
-    rt.set_data(weight, &weight_data);
-    rt = cx.search(rt, 1);
-
-    let kernels = rt.debug_kernel_ops();
-    assert!(
-        kernels
-            .iter()
-            .any(|k| k.contains("MPSBatchedMatmul") && k.contains("transpose_rhs: true")),
-        "expected MPS batched matmul transposed RHS path, kernels: {:?}",
-        kernels
-    );
-
-    rt.allocate_intermediate_buffers(&cx.dyn_map);
-    rt.execute(&cx.dyn_map);
-    let result = rt.get_f32(output);
-
-    let mut expected = vec![0.0; batch * m * n];
-    for batch_idx in 0..batch {
-        for row in 0..m {
-            for col in 0..n {
-                let mut sum = 0.0;
-                for inner in 0..k {
-                    sum += a_data[batch_idx * m * k + row * k + inner]
-                        * weight_data[batch_idx * n * k + col * k + inner];
-                }
-                expected[batch_idx * m * n + row * n + col] = sum;
-            }
-        }
-    }
-
-    assert_close(&result, &expected, 1e-3);
-}
-
-#[test]
-fn metal_mps_matmul_f16_transposed_rhs_weight_layout() {
-    let mut cx = Graph::default();
-    let m = 6;
-    let k = 10;
-    let n = 7;
-    let a = cx.tensor((m, k)).as_dtype(DType::F16);
-    let weight = cx.tensor((n, k)).as_dtype(DType::F16);
-    let output = a.matmul(weight.t()).cast(DType::F32).output();
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-
-    let a_data = seeded_data(m * k, 0.22, -0.07);
-    let weight_data = seeded_data(n * k, 0.18, -0.05);
-
-    rt.set_data(a, to_f16_vec(&a_data));
-    rt.set_data(weight, to_f16_vec(&weight_data));
-    rt = cx.search(rt, 1);
-
-    let kernels = rt.debug_kernel_ops();
-    assert!(
-        kernels.iter().any(|k| k.contains("transpose_rhs: true")),
-        "expected MPS F16 matmul to cover transposed row-major RHS, kernels: {:?}",
-        kernels
-    );
-
-    rt.allocate_intermediate_buffers(&cx.dyn_map);
-    rt.execute(&cx.dyn_map);
-
-    let result = rt.get_f32(output);
-
-    let device = CandleDevice::Cpu;
-    let ref_a = CandleTensor::from_vec(a_data, (m, k), &device).unwrap();
-    let ref_weight = CandleTensor::from_vec(weight_data, (n, k), &device).unwrap();
-    let expected = ref_a.matmul(&ref_weight.t().unwrap()).unwrap();
-    let expected: Vec<f32> = expected.flatten_all().unwrap().to_vec1().unwrap();
-
-    assert_close(&result, &expected, 5e-3);
-}
-
 #[test]
 fn metal_rms_norm() {
    let mut cx = Graph::default();
@@ -1338,153 +971,6 @@ fn test_scatter_basic() {
    assert_close(&out, &[0.0, 10.0, 0.0, 20.0, 30.0], 0.001);
 }

-#[test]
-fn test_scatter_buffer_roundtrip() {
-    let mut cx = Graph::default();
-    let src = cx.tensor(1);
-    let indexes = cx.tensor(1).as_dtype(DType::Int);
-    let cache = cx.tensor(4).persist();
-    let cache_out = src.scatter(indexes, cache);
-    let read = cache_out.output();
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-    rt.set_data(src, &[0.0]);
-    rt.set_data(indexes, &[0.0]);
-    rt.set_zeros(cache, 4 * std::mem::size_of::<f32>());
-    rt = cx.search(rt, 1);
-
-    for (pos, value, expected) in [
-        (0, 10.0, [10.0, 0.0, 0.0, 0.0]),
-        (1, 20.0, [10.0, 20.0, 0.0, 0.0]),
-        (2, 30.0, [10.0, 20.0, 30.0, 0.0]),
-    ] {
-        rt.set_data(src, &[value]);
-        rt.set_data(indexes, &[pos as f32]);
-        rt.allocate_intermediate_buffers(&cx.dyn_map);
-        rt.execute(&cx.dyn_map);
-        assert_close(&rt.get_f32(read), &expected, 0.001);
-
-        let updated_cache = rt.remove_buffer(cache_out);
-        rt.set_buffer(cache, updated_cache);
-    }
-}
-
-#[test]
-fn test_load_safetensors_f32_survives_search_and_overrides_input_data() {
-    let mut cx = Graph::default();
-    let weights = cx.named_tensor("weights", 3);
-    let bias = cx.named_tensor("bias", 3);
-    let out = (weights + bias).output();
-
-    let weight_values = [1.25f32, -2.5, 4.0];
-    let tensors = [("weights", Dtype::F32, vec![3], bytes_of(&weight_values))];
-    let path = write_test_safetensors(&tensors);
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-    rt.set_data(weights, &[99.0, 99.0, 99.0]);
-    rt.set_data(bias, &[0.5, 1.0, -1.5]);
-    rt.load_safetensors(&cx, path.to_str().unwrap());
-    rt = cx.search(rt, 1);
-    rt.allocate_intermediate_buffers(&cx.dyn_map);
-    rt.execute(&cx.dyn_map);
-
-    assert_close(&rt.get_f32(out), &[1.75, -1.5, 2.5], 0.001);
-    std::fs::remove_file(path).ok();
-}
-
-#[test]
-fn test_load_safetensors_converts_supported_float_dtypes() {
-    let mut cx = Graph::default();
-    let f16_to_f32 = cx.named_tensor("f16_to_f32", 2);
-    let bf16_to_f32 = cx.named_tensor("bf16_to_f32", 2);
-    let f16_to_f16 = cx.named_tensor("f16_to_f16", 2).as_dtype(DType::F16);
-    let f32_to_f16 = cx.named_tensor("f32_to_f16", 2).as_dtype(DType::F16);
-    let bf16_to_f16 = cx.named_tensor("bf16_to_f16", 2).as_dtype(DType::F16);
-
-    let f16_to_f32_out = (f16_to_f32 + 0.0).output();
-    let bf16_to_f32_out = (bf16_to_f32 + 0.0).output();
-    let f16_to_f16_out = f16_to_f16.cast(DType::F32).output();
-    let f32_to_f16_out = f32_to_f16.cast(DType::F32).output();
-    let bf16_to_f16_out = bf16_to_f16.cast(DType::F32).output();
-
-    let f16_to_f32_values = [f16::from_f32(1.5), f16::from_f32(-2.25)];
-    let bf16_to_f32_values = [bf16::from_f32(3.5), bf16::from_f32(-4.25)];
-    let f16_to_f16_values = [f16::from_f32(5.5), f16::from_f32(-6.25)];
-    let f32_to_f16_values = [7.5f32, -8.25];
-    let bf16_to_f16_values = [bf16::from_f32(9.5), bf16::from_f32(-10.25)];
-    let tensors = [
-        (
-            "f16_to_f32",
-            Dtype::F16,
-            vec![2],
-            bytes_of(&f16_to_f32_values),
-        ),
-        (
-            "bf16_to_f32",
-            Dtype::BF16,
-            vec![2],
-            bytes_of(&bf16_to_f32_values),
-        ),
-        (
-            "f16_to_f16",
-            Dtype::F16,
-            vec![2],
-            bytes_of(&f16_to_f16_values),
-        ),
-        (
-            "f32_to_f16",
-            Dtype::F32,
-            vec![2],
-            bytes_of(&f32_to_f16_values),
-        ),
-        (
-            "bf16_to_f16",
-            Dtype::BF16,
-            vec![2],
-            bytes_of(&bf16_to_f16_values),
-        ),
-    ];
-    let path = write_test_safetensors(&tensors);
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-    rt.load_safetensors(&cx, path.to_str().unwrap());
-    rt = cx.search(rt, 1);
-    rt.allocate_intermediate_buffers(&cx.dyn_map);
-    rt.execute(&cx.dyn_map);
-
-    assert_close(&rt.get_f32(f16_to_f32_out), &[1.5, -2.25], 0.001);
-    assert_close(&rt.get_f32(bf16_to_f32_out), &[3.5, -4.25], 0.001);
-    assert_close(&rt.get_f32(f16_to_f16_out), &[5.5, -6.25], 0.001);
-    assert_close(&rt.get_f32(f32_to_f16_out), &[7.5, -8.25], 0.001);
-    assert_close(&rt.get_f32(bf16_to_f16_out), &[9.5, -10.25], 0.001);
-    std::fs::remove_file(path).ok();
-}
-
-#[test]
-fn test_gather_noncontiguous_data_uses_data_shape() {
-    let mut cx = Graph::default();
-    let input = cx.tensor((4, 3));
-    let data = input.transpose(0, 1);
-    let indexes = cx.tensor((2, 2)).as_dtype(DType::Int);
-    let out = data.gather(indexes).output();
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-    rt.set_data(
-        input,
-        &[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0],
-    );
-    rt.set_data(indexes, &[0.0, 3.0, 4.0, 7.0]);
-    rt = cx.search(rt, 1);
-    rt.allocate_intermediate_buffers(&cx.dyn_map);
-    rt.execute(&cx.dyn_map);
-
-    assert_close(&rt.get_f32(out), &[0.0, 9.0, 1.0, 10.0], 0.001);
-}
-
 #[test]
 fn test_scatter_into_nonzero_dest() {
    let mut cx = Graph::default();
@@ -1499,12 +985,6 @@ fn test_scatter_into_nonzero_dest() {
    rt.set_data(indexes, &[2f32]);
    rt.set_data(dest, &[1.0, 2.0, 3.0, 4.0, 5.0]);
    rt = cx.search(rt, 1);
-    let kernels = rt.debug_kernel_ops();
-    assert!(
-        kernels.iter().any(|k| k.contains("MetalScatterNoCopy")),
-        "expected no-copy scatter for consumed destination, kernels: {:?}",
-        kernels
-    );
    rt.allocate_intermediate_buffers(&cx.dyn_map);
    rt.execute(&cx.dyn_map);

@@ -1512,89 +992,6 @@ fn test_scatter_into_nonzero_dest() {
    assert_close(&out, &[1.0, 2.0, 99.0, 4.0, 5.0], 0.001);
 }

-#[test]
-fn test_scatter_no_copy_remove_buffer_aliases_dest() {
-    let mut cx = Graph::default();
-    let src = cx.tensor(2);
-    let indexes = cx.tensor(2).as_dtype(DType::Int);
-    let dest = cx.tensor(5);
-    let result = src.scatter(indexes, dest).output();
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-    rt.set_data(src, &[7.0, 8.0]);
-    rt.set_data(indexes, &[1.0, 3.0]);
-    rt.set_data(dest, &[10.0, 20.0, 30.0, 40.0, 50.0]);
-    rt = cx.search(rt, 1);
-    rt.allocate_intermediate_buffers(&cx.dyn_map);
-    rt.execute(&cx.dyn_map);
-
-    let moved = rt.remove_buffer(result);
-    let moved_values = unsafe {
-        std::slice::from_raw_parts(
-            moved.contents() as *const f32,
-            moved.length() as usize / std::mem::size_of::<f32>(),
-        )
-        .to_vec()
-    };
-    assert_close(&moved_values, &[10.0, 7.0, 30.0, 8.0, 50.0], 0.001);
-    rt.set_buffer(dest.id, moved);
-}
-
-#[test]
-fn test_scatter_no_copy_handles_2d_destination() {
-    let mut cx = Graph::default();
-    let src = cx.tensor(2);
-    let indexes = cx.tensor(2).as_dtype(DType::Int);
-    let dest = cx.tensor((2, 3));
-    let result = src.scatter(indexes, dest).output();
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-    rt.set_data(src, &[9.0, 8.0]);
-    rt.set_data(indexes, &[2.0, 4.0]);
-    rt.set_data(dest, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]);
-    rt = cx.search(rt, 1);
-    let kernels = rt.debug_kernel_ops();
-    assert!(
-        kernels.iter().any(|k| k.contains("MetalScatterNoCopy")),
-        "expected no-copy scatter for 2D destination, kernels: {:?}",
-        kernels
-    );
-    rt.allocate_intermediate_buffers(&cx.dyn_map);
-    rt.execute(&cx.dyn_map);
-
-    assert_close(&rt.get_f32(result), &[1.0, 2.0, 9.0, 4.0, 8.0, 6.0], 0.001);
-}
-
-#[test]
-fn test_scatter_no_copy_not_selected_when_dest_has_another_consumer() {
-    let mut cx = Graph::default();
-    let src = cx.tensor(1);
-    let indexes = cx.tensor(1).as_dtype(DType::Int);
-    let dest = cx.tensor(4);
-    let scatter = src.scatter(indexes, dest).output();
-    let dest_plus_one = (dest + 1.0).output();
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-    rt.set_data(src, &[99.0]);
-    rt.set_data(indexes, &[1.0]);
-    rt.set_data(dest, &[10.0, 20.0, 30.0, 40.0]);
-    rt = cx.search(rt, 1);
-    let kernels = rt.debug_kernel_ops();
-    assert!(
-        !kernels.iter().any(|k| k.contains("MetalScatterNoCopy")),
-        "no-copy scatter should not be selected when dest is also consumed, kernels: {:?}",
-        kernels
-    );
-    rt.allocate_intermediate_buffers(&cx.dyn_map);
-    rt.execute(&cx.dyn_map);
-
-    assert_close(&rt.get_f32(scatter), &[10.0, 99.0, 30.0, 40.0], 0.001);
-    assert_close(&rt.get_f32(dest_plus_one), &[11.0, 21.0, 31.0, 41.0], 0.001);
-}
-
 #[test]
 fn test_scatter_all_positions() {
    let mut cx = Graph::default();
@@ -1615,21 +1012,3 @@ fn test_scatter_all_positions() {
    let out = rt.get_f32(result);
    assert_close(&out, &[10.0, 20.0, 30.0, 40.0], 0.001);
 }
-
-#[test]
-fn test_gather_preserves_data_dtype() {
-    let mut cx = Graph::default();
-    let data = cx.tensor(2);
-    let indexes = cx.tensor(1).as_dtype(DType::Int);
-    let out = data.gather(indexes).output();
-
-    cx.build_search_space::<MetalRuntime>();
-    let mut rt = MetalRuntime::initialize(());
-    rt.set_data(data, &[1.25, 2.5]);
-    rt.set_data(indexes, &[1.0]);
-    rt = cx.search(rt, 1);
-    rt.allocate_intermediate_buffers(&cx.dyn_map);
-    rt.execute(&cx.dyn_map);
-
-    assert_close(&rt.get_f32(out), &[2.5], 0.001);
-}
--- a/crates/luminal_nn/Cargo.toml
+++ b/crates/luminal_nn/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "luminal_nn"
 version = "0.1.0"
-edition = "2024"
+edition = "2021"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

--- a/crates/luminal_nn/src/moe.rs
+++ b/crates/luminal_nn/src/moe.rs
@@ -61,8 +61,7 @@ impl MoE {
        let expert_out = expanded_act.matmul(gathered).squeeze(n); // [batch.., k, out]

        // 6. Weighted sum over experts: [batch.., k, out] * [batch.., k, 1] → sum(k) → [batch.., out]
-        let mut weights_exp = top_k_values.unsqueeze(top_k_values.dims().len()); // [batch.., k, 1]
-        weights_exp.shape.expand(expert_out.dims());
+        let weights_exp = top_k_values.unsqueeze(top_k_values.dims().len()); // [batch.., k, 1]
        (expert_out * weights_exp).sum(n - 1)
    }
 }
@@ -71,7 +70,7 @@ impl MoE {
 mod tests {
    use super::MoE;
    use luminal::prelude::*;
-    use rand::{Rng, rng};
+    use rand::{rng, Rng};

    fn random_vec(n: usize) -> Vec<f32> {
        let mut r = rng();
@@ -479,8 +478,7 @@ mod tests {
        let down_out = hidden_exp.matmul(down_gathered.transpose(2, 3)).squeeze(2); // [s, k, H]

        // 7. Weighted sum over k experts → [s, H]
-        let mut weights_exp = top_k_values.unsqueeze(top_k_values.dims().len()); // [s, k, 1]
-        weights_exp.shape.expand(down_out.dims());
+        let weights_exp = top_k_values.unsqueeze(top_k_values.dims().len()); // [s, k, 1]
        let _output = (down_out * weights_exp).sum(n - 1).output();

        // Dump the HLIR to egglog
--- a/crates/luminal_python/LessonsLearned.md
+++ b/crates/luminal_python/LessonsLearned.md
@@ -749,6 +749,92 @@ candidates rejected" during search, check whether the rejection is from actual f
 or from dtype misinterpretation — the key diagnostic is whether the NaN pattern is
 identical across all attempts (dtype issue) vs varying (actual numerical issue).

+## 2026-04-22 — Benchmark python_luminal Path: NativeRuntime Panic on CUDA Weights
+
+### What the symptom was
+
+Running `benchmarks/ttft/run.py` with the `python_luminal` path panicked deep in Rust:
+
+```
+thread panicked at src/hlir.rs:2239:40: no entry found for key
+```
+
+The panic occurred in `NativeRuntime::execute` when the `Output` node tried to read its
+predecessor's buffer from `self.buffers` — and the buffer wasn't there.
+
+### What the actual root cause was
+
+The luminal Python wheel was built without `--features cuda` (plain `maturin build --release`).
+This means `_cuda_lite_factory_capsule` is not compiled into the `.so` file. In `main.py`,
+`_detect_factory_capsule` catches the resulting `ImportError` and **silently** falls back to
+`_native_factory_capsule` (NativeRuntime / CPU runtime).
+
+The benchmark model (`LlamaForCausalLM.from_pretrained(...).to("cuda")`) has all weights as
+CUDA device pointers. `BackendCompileArgs.device_ptrs` is populated with these GPU pointers.
+NativeRuntime has no mechanism to handle GPU-resident weight data — the `device_ptrs` map is
+simply ignored. After search completes (it can search because it uses dummy CPU data during
+profiling), the first real `execute()` call processes the graph:
+
+1. `Input` nodes are skipped (their buffers should be pre-populated by `set_input_from_ptr`)
+2. Weight `Input` nodes were set via `set_input_device_ptr` — but NativeRuntime's
+   `set_input_device_ptr` likely no-ops or stores garbage, leaving those buffers empty
+3. The `Output` node looks up its predecessor's buffer → key not found → panic
+
+### Why it was hard to find
+
+1. **Silent fallback**: `_detect_factory_capsule` catches `ImportError` without logging a
+   warning. Nothing in stdout indicates you're running on CPU when the model is on GPU.
+2. **Search succeeds**: The e-graph search runs to completion (searches 1 group, 1 chunk in
+   ~15s) because it uses 1.0f32 dummy data that doesn't need GPU. The failure only occurs at
+   first real execution.
+3. **Misleading error site**: `hlir.rs:2239` is in NativeRuntime's buffer-copy loop for Output
+   nodes — it gives no indication that the root cause is a missing CUDA feature flag at build time.
+4. **Backtrace required**: Without `RUST_BACKTRACE=1`, only the panic message is visible;
+   the `NativeRuntime` frame that reveals the CPU fallback is hidden.
+
+### The fix
+
+Rebuild the wheel with CUDA support:
+```bash
+maturin build --release --features cuda
+pip install target/wheels/luminal_python-*.whl --force-reinstall
+```
+
+Or via the test runner: `./run_tests_cuda.sh` uses `maturin develop --features cuda -r`.
+
+Consider adding an explicit warning or error in `_detect_factory_capsule` when CUDA inputs are
+detected but no CUDA factory is available:
+
+```python
+if device.type == "cuda":
+    try:
+        from .luminal import _cuda_lite_factory_capsule
+        return _cuda_lite_factory_capsule()
+    except ImportError:
+        import warnings
+        warnings.warn(
+            "CUDA inputs detected but luminal was built without --features cuda. "
+            "Falling back to NativeRuntime (CPU) — this will likely panic at runtime.",
+            RuntimeWarning,
+            stacklevel=3,
+        )
+```
+
+### The regression test
+
+`test_hf_llama3_8b_instruct_1layer` in `tests/test_llama3.py` — tests the exact architecture
+from the benchmark (Meta-Llama-3-8B-Instruct, 4096 hidden, 32 attn heads, 8 KV heads) with
+1 layer and random weights. This test passes with `--features cuda` and panics without it.
+
+### General principle
+
+**When a feature gate silently changes the runtime backend, assert that the selected backend
+is compatible with the input device.** A CUDA tensor flowing into a CPU-only runtime is always
+a programming error, not a graceful degradation. The failure should surface at factory
+selection time (with a clear error message), not deep in a Rust buffer-copy loop.
+
+---
+
 ## 2026-03-25 — KernelExp/KernelSigmoid: Fused CUDA Kernels for Precision

 1. **Symptom**: `test_hf_llama3_full` (16-layer Llama-3.2-1B) had ~1e-4 max diff vs PyTorch.
@@ -757,6 +843,44 @@ identical across all attempts (dtype issue) vs varying (actual numerical issue).
 4. **Fix**: Added `KernelExp` (uses `expf()`), `KernelSigmoid` (uses `1/(1+expf(-x))`), and Kahan summation in SumReduce. Each uses both `kernel_rewrite` and a direct egglog pattern match with range checks (e.g., `(> ?val 1.44) (< ?val 1.45)`) to bypass constant format dependency.
 5. **Principle**: When decomposed CUDA kernel chains cause precision loss, add fused kernels via `kernel_rewrite`. For robustness, add BOTH the logical-op rewrite path AND a direct HLIR pattern match — the constant format in egglog can be fragile.

+---
+
+## 2026-04-23 — NativeRuntime Multi-Call Panic: Input Buffers Cleared After Each Run
+
+1. **Symptom**: The compiled model panicked with `hlir.rs:XXXX: no entry found for key` on the second call. First call succeeded; subsequent calls failed.
+2. **Root cause**: `NativeRuntime::execute` in `src/hlir.rs` called `self.buffers.retain(|k, _| output_nodes.contains(k))` after each run to free intermediate buffers. This correctly pruned temporary buffers but also pruned the Input-node buffers that hold model weights — so on the second call, the weight tensors were gone.
+3. **Why hard**: The bug never manifested in the test suite because every test called the compiled model exactly once per compile. The issue only appeared when running a bench loop that called the model multiple times. The panic location (deep in buffer lookup) gave no indication that the root cause was in the buffer retention policy.
+4. **Fix**: Changed the retain predicate to keep both `Output` and `Input` nodes:
+   ```rust
+   let keep_nodes = graph.node_indices()
+       .filter(|n| is::<Output> || is::<Input>)
+       .collect();
+   self.buffers.retain(|k, _| keep_nodes.contains(k));
+   ```
+5. **Principle**: When buffer lifetime policies are changed to free memory after a run, always verify that *persistent* state (model weights stored in Input nodes) is excluded from the cleanup sweep. A test that compiles + calls once per test function will never catch a multi-call regression — add a dedicated multi-call test for any compiled runtime.
+
+---
+
+## 2026-04-23 — PT2 USER_INPUT_MUTATION Outputs Confuse Dynamo Caller
+
+1. **Symptom**: With `StaticCache`, the compiled model returned `[1]` (cumulative_length update) instead of `[1, vocab_size]` logits. The wrong tensor was silently mapped to the output variable.
+2. **Root cause**: When `torch.export` encounters in-place mutations to input tensors (KV cache updates via `index_copy_`), it lifts them as `USER_INPUT_MUTATION` output specs, placed *before* the actual `USER_OUTPUT` logits in `ep.graph_signature.output_specs`. The compiled model returned all outputs; dynamo mapped index 0 (the mutation) to the first return value.
+3. **Why hard**: The output shape `[1]` from `cumulative_length` looked like a valid (though wrong) output. No error was raised — just wrong logits. Required inspecting `ep.graph_signature.output_specs` and understanding the ordering convention for different `OutputKind` values.
+4. **Fix**: In `pt2_backend`, parse `output_specs` to build a `mutation_mappings` list and `user_output_indices`. Wrap the compiled model to: (a) copy mutation outputs back into the corresponding input tensors, and (b) return only the `USER_OUTPUT` tensors.
+5. **Principle**: After `torch.export(...).run_decompositions()`, always inspect `ep.graph_signature.output_specs` when the model has in-place operations (KV cache, BN running stats). The output ordering is: mutations first, then actual outputs — and the caller only expects actual outputs.
+
+---
+
+## 2026-04-23 — CUDA Version Mismatch: torch+cuXXX Must Match System Driver
+
+1. **Symptom**: `torch.cuda.is_available()` returned `False` despite `nvidia-smi` showing a GPU. Warning: "CUDA initialization: The NVIDIA driver on your system is too old (found version 12080)."
+2. **Root cause**: `torch==2.11.0+cu130` requires CUDA 13.0 which needs driver >= 575. The system has driver 570 (CUDA 12.8 max). The mismatch caused silent CPU fallback — no error, just False from `is_available()`.
+3. **Why hard**: The bench appeared to start successfully (model loaded, compilation ran) but produced no results because it was running an 8B model on CPU. Zero output with exit code 0 looked like a hang or silent crash.
+4. **Fix**: Installed `torch==2.11.0+cu128` from `https://download.pytorch.org/whl/cu128`. CUDA 12.8 matches driver 570. Also needed matching `torchvision==0.26.0+cu128` and the `nvidia-cusparselt-cu12` runtime library.
+5. **Principle**: Before running any CUDA-dependent bench or test, verify `torch.cuda.is_available()` returns `True`. Check `nvidia-smi` CUDA Version field against the `+cuXXX` suffix in `torch.__version__` — they must match (CUDA runtime ≤ driver's max supported version). Never assume CPU fallback "works" for large model benchmarks.
+
+---
+
 ## 2026-04-26 — Loop unroll-union rules silently disabled in full egglog stage

 1. **Symptom**: Python `test_llama_transformer_block` (CUDA backend) produced output ~1e-2 off from PyTorch (atol=1e-4) on the `loop_rolling` branch. All component tests (RMSNorm, attention, SwiGLU, RoPE) passed. The diff pattern was suspicious: row 0 of the (1,4,32) output matched exactly, rows 1–3 differed slightly. Disabling rolling fixed it.
@@ -767,6 +891,8 @@ identical across all attempts (dtype issue) vs varying (actual numerical issue).
 4. **Fix**: Register `binary_op_unroll_rules` in BOTH `early_rewrites()` (so fusion patterns like GLUMoE can match before the early-stage extract, which is what fixed `test_glumoe_gemma_gelu_matches_unfused_output` earlier in the session) AND `rewrites()` (so kernel-level rewrites like `direct-exp-fusion` can match in the full stage on the unrolled chain). One block per binary op (`Add`, `Mul`, `Mod`, `LessThan`).
 5. **Principle**: When egglog has multiple stages (early/full) with disjoint rule sets, any rewrite that materialises new HLIR/IR enodes (rather than just lowering to LLIR) needs to fire in BOTH stages if downstream rewrites in BOTH stages might want to see the new structure. Putting "preparatory" rewrites only in `early_rewrites` means their effect is lost across the early→full handoff. The narrow rule of thumb: if your rule's outputs are intended to enable matches by other rules, audit which stages those other rules run in and register accordingly.

+---
+
 ## 2026-04-26 — `unroll_loops_in_llir` panicked on iteration-invariant body producers

 1. **Symptom**: Modal CI/CD job for the gemma example panicked at `src/graph.rs:1867` with `no entry found for key`. The line is `clone_map[i - 1][&body_producer]` inside `unroll_loops_in_llir`'s `resolve_src` closure — `body_producer` (the LoopEnd's incoming source for that slot) wasn't a key in the per-iteration clone map. cuda_lite/python tests didn't repro: only triggered by the specific genome and graph shapes that gemma's longer search settles on.
@@ -775,6 +901,8 @@ identical across all attempts (dtype issue) vs varying (actual numerical issue).
 4. **Fix**: in `unroll_loops_in_llir::resolve_src`, when the LoopStart-resolved `body_producer` isn't in `body_nodes`, return `body_producer` itself for iter > 0 instead of indexing `clone_map[i - 1]`. The body op didn't depend on the loop variable, so every iter > 0 carries the same value forward — using `body_producer` directly is semantically correct. Mirrored the same `unwrap_or(body_producer)` fallback in the post-loop substitution map (`marker_post_sub` for LoopEnd / LoopOutputSelect). Added a backward-walk-from-end-markers backfill in `collapse_loops_to_first_iter` so its body-node iteration also covers these nodes (it doesn't have a clone_map, but does need to rewire body ops' incoming edges before deleting markers).
 5. **Principle**: When a graph-walk-derived set is used as a hashmap key requirement, every code path that *could* produce a key outside that set needs a graceful fallback — not just a defensive `expect`. For loop unrolling specifically, the rule is: `body_nodes` is the set of "ops that participate in per-iter computation"; ops on the LoopEnd's path that *don't* participate (iteration-invariant) are still legitimate, and need a "no clone, share across iters" path through `resolve_src` and `marker_post_sub`. Forward-walk-only `body_nodes` is correct only when extraction never produces iteration-invariant body producers — and in an egglog-driven search, that's not a guarantee you can make.

+---
+
 ## 2026-04-26 — Iteration-invariant state slots are a first-class concept, not a defensive fallback

 1. **Symptom + fix recap**: gemma Modal CI panicked at `clone_map[i-1][&body_producer]` because some state slots' `body_producer` (LoopEnd's incoming) isn't in `body_nodes` (forward walk from input markers). The first commit pair (16de9638 / 93fb02c4) caught this with `.unwrap_or(body_producer)` — which works but reads as "defensive, unclear *why* this case exists."
@@ -855,13 +983,70 @@ Two important details:

 Also: search-time dummy-1 inputs are not the same shape as runtime inputs. Anything you compute from a runtime tensor (cumsum offsets, routing indices, mask boundaries) needs to remain in-bounds for the dummy. Clamp index-producing chains as a matter of course, not just when the math says you "should" — `make_ones_bytes` is a hostile witness.

-## 2026-05-02 — Whisper port hit two missing-translator pitfalls
+---
+
+## 2026-05-01 — `KernelScatter` float4 vectorization wrote 2× past end of buffer for bf16/f16 KV cache
+
+### What the symptom was
+
+After the `translate_grouped_mm` gather rewrite (above) cleared the OOM, the qwen3-moe bench progressed past search but panicked during execution roughly 40% of the time:
+```
+crates/luminal_cuda_lite/src/runtime.rs:1204:
+  CUDA execute error in "CudaGraph":
+    DriverError(CUDA_ERROR_ILLEGAL_ADDRESS, "an illegal memory access was encountered")
+```
+qwen3-4b (dense) was unaffected; the bf16 KV cache in HF `StaticCache` was the only path triggering it. The rust `examples/qwen3_moe` ran fine because it uses an F32 KV cache.
+
+### What the actual root cause was
+
+`KernelScatter::compile` in `crates/luminal_cuda_lite/src/kernel/hlir.rs` emitted a hand-written CUDA copy phase that vectorised through `float4` (16-byte) reads/writes:
+
+```cuda
+long long n_vec = n_dest / 4;          // ← assumes 4-byte dtype
+float4 *out4 = (float4 *)out;
+const float4 *dest4 = (const float4 *)dest;
+for (long long i = tid; i < n_vec; i += blockDim.x) {
+    out4[i] = dest4[i];                 // ← writes 16 B per iteration
+}
+long long remainder_start = n_vec * 4;  // ← also assumes 4 elem/vec
+```
+
+For `dtype=F32` (4 bytes), `n_vec * 16 = n_dest * 4` bytes — exactly fills the buffer. For `dtype=Bf16` (2 bytes), `n_vec * 16 = (n_dest/4) * 16 = n_dest * 4` bytes, which is **2× the actual buffer size of `n_dest * 2` bytes**. The write walks half the buffer past the end of `out` (and reads past `dest`).
+
+Whether that produced an `ILLEGAL_ADDRESS` depended on whether the OOB region happened to land on an unmapped page. For different search outcomes, the surrounding allocator state differed → ~60% it was silent corruption, ~40% it crashed the CUDA context. That probabilistic mix is why the bug had been hidden — no test exercised a bf16 scatter (every existing scatter test uses F32 by default), and the rust example uses F32 KV cache so it was never seen there either.
+
+### Why it was hard to find
+
+1. **Probabilistic, but search-determinate**: the rewrite from HLIR `Scatter` → `KernelScatter` always fires (it's the only non-NoCopy path), so the kernel is always present. The crash depends on memory layout, which depends on which other kernels the search picked. Made it look like an egglog-mutation issue rather than a kernel-correctness issue.
+2. **Existing test coverage was F32-only**: `test_scatter_execution_correctness` (in `tests/consumed_buffer_tests.rs`) explicitly tries 50 random extractions to cover both `Scatter` and `ScatterNoCopy`, but always with `cx.tensor(5)` which defaults to F32. The bug would never surface there.
+3. **The panic message hid the kernel name**: it surfaced as a generic `"CudaGraph"` host-op panic — the cuda_graph_exec batches all kernels into one atomic launch, so the failing kernel disappears into the batch. To localize it I had to add a `LUMINAL_DEBUG_SEQ` env var to `CudaGraphOp::execute_internal` that bypasses graph batching and launches each kernel via `cuLaunchKernel` with a sync afterwards, surfacing kernel name + node + grid/block/pointers when one fails.
+
+### The fix
+
+Parameterise `n_vec` and the remainder-loop start by the number of dtype elements that fit in 16 bytes:
+
+```rust
+let elements_per_vec: usize = match self.dtype {
+    DType::F64 => 2,
+    DType::F32 | DType::Int => 4,
+    DType::F16 | DType::Bf16 | DType::I16 | DType::U16 => 8,
+    DType::Bool | DType::I8 | DType::U8
+        | DType::F8UE8M0 | DType::F8E4M3 | DType::F8E5M2 => 16,
+    other => panic!("Unsupported dtype for scatter vectorization: {other:?}"),
+};
+```
+and substitute `{elements_per_vec}` into the kernel template (both the `n_vec` calc and `remainder_start`). For F32 / Int the generated code is byte-for-byte identical to before, so existing F32 tests are unaffected; for any other dtype the byte coverage now exactly equals `n_dest * sizeof(dtype)` as intended.
+
+### Result
+
+Before fix: 3/5 success at iters=10 (probabilistic).
+After fix: 5/5 at iters=10, 3/3 at iters=50. All 206 HLIR tests still pass. TTFT/TPOT identical (~9.35s / ~1.17s).
+
+### General principle
+
+**Hand-rolled CUDA vectorisation with a fixed-width type (`float4`, `float2`, `int4`, …) is almost always specialised to one element size.** When the same kernel template is parameterised by `dtype`, every byte-count expression has to be too. The cheapest correct form is "elements per vector load" computed from the dtype's byte size — never hardcode `/4`.
+
+Also: **F32 is not a representative test dtype for kernels with vector loads.** When a kernel is written generic-over-dtype, the test matrix needs to actually exercise the dtypes (bf16, f16, bool) where the vector-element-count differs. A `test_scatter_bf16` would have caught this years before the qwen3-moe bench did. Same trap likely exists wherever else `float4` is cast over a `{dtype} *` template.
+
+Diagnostic also added: `LUMINAL_DEBUG_SEQ=1` on the python_luminal path will now bypass `CudaGraphOp` batching at execute time, launching each kernel sequentially with a sync afterwards. If a future ILLEGAL_ADDRESS hides inside a batched graph again, this surfaces the kernel name and node index immediately.

-1. **Symptom**: Compiling a PyTorch port of Whisper-tiny.en through `luminal_backend` failed twice in a row at the dispatch table: first with `Unsupported ATen op: torch.ops.aten.gelu.default`, then with `full: unsupported fill value type ... -Infinity`.
-2. **Root cause #1**: the dispatch table in `crates/luminal_python/rust/src/translator/dispatch.rs` mapped `sigmoid`, `tanh`, `relu` etc. but not `gelu` or `silu`. Whisper's encoder uses `F.gelu`, so the activation hit a hole.
-3. **Root cause #2**: PyTorch serializes `float("-inf")` in PT2 as the string `"-Infinity"` (and `"NaN"`/`"Infinity"` analogously). `translate_full`'s `get_float_arg` only accepts numeric float/int payloads, so any `torch.full((..), -inf)` (the obvious way to write a causal mask) blows up. Decoder mask code is the most common spot.
-4. **Why it was tricky**: both errors arrive from inside `pt2_backend` with a stack trace that ends in `process_pt2`, hiding the actual ATen target inside the message. You only see the offending op name in the error string itself, so you have to read `RuntimeError: Failed to translate node N: …` carefully and grep `dispatch.rs` for it.
-5. **Fix in this session**:
-   - Added `aten.gelu.default → a.gelu()` and `aten.silu.default → a.silu()` to `dispatch.rs`.
-   - Worked around the `-Infinity` issue at the model level by using a finite `-1e10` for the causal mask in the example (matches the Rust example's convention). The cleaner fix (parsing `"-Infinity"`/`"Infinity"`/`"NaN"` strings in `get_float_arg` / `translate_full`) is left for a follow-up.
-6. **Principle**: when adding a new model that goes through the PT2 backend, expect to plug small holes in `dispatch.rs` and `translator/tensor.rs::translate_full`. The trace points at the python frame, not the Rust dispatch arm — open `dispatch.rs`, ctrl-F the offending op name, and add the one-liner. For float-shaped sentinel values (`-inf`, `inf`, `nan`), the export pipeline currently only accepts finite floats; either rewrite the model or extend the parser.
--- a/crates/luminal_python/README.md
+++ b/crates/luminal_python/README.md
@@ -1,60 +0,0 @@
-# luminal_python
-
-PyTorch `torch.compile` integration for Luminal.
-
-## CUDA Tests
-
-The Python CUDA CI job builds the Rust extension with the CUDA feature and runs
-the non-slow pytest suite:
-
-```bash
-cd crates/luminal_python
-RUST_BACKTRACE=1 \
-LUMINAL_TEST_DEVICE=cuda \
-MATURIN_PEP517_ARGS="--features cuda --profile release" \
-CUDARC_CUDA_VERSION=12080 \
-uv run --group dev python -m pytest tests/ -v -s -m "not slow"
-```
-
-The slow tests are explicit opt-in. They include large/pretrained model tests,
-full-width architecture compiles, Whisper end-to-end cases, and other cases that
-can take a long time or need a large GPU / Hugging Face cache.
-
-Run the full Python CUDA suite, including slow tests:
-
-```bash
-cd crates/luminal_python
-RUST_BACKTRACE=1 \
-LUMINAL_TEST_DEVICE=cuda \
-MATURIN_PEP517_ARGS="--features cuda --profile release" \
-CUDARC_CUDA_VERSION=12080 \
-uv run --group dev python -m pytest tests/ -v -s
-```
-
-Run only the slow Python CUDA tests:
-
-```bash
-cd crates/luminal_python
-RUST_BACKTRACE=1 \
-LUMINAL_TEST_DEVICE=cuda \
-MATURIN_PEP517_ARGS="--features cuda --profile release" \
-CUDARC_CUDA_VERSION=12080 \
-uv run --group dev python -m pytest tests/ -v -s -m slow
-```
-
-The helper script follows the same convention:
-
-```bash
-cd crates/luminal_python
-./run_tests_cuda.sh              # non-slow CUDA suite
-./run_tests_cuda.sh --slow-only  # only slow CUDA tests
-./run_tests_cuda.sh --include-slow
-```
-
-The GitHub/Modal entrypoint uses the same marker split:
-
-```bash
-cd crates/luminal_python
-modal run modal_pytest_runner.py --gpu A100 --timeout 7200 tests/ -v -s -m "not slow"
-modal run modal_pytest_runner.py --gpu A100 --timeout 7200 tests/ -v -s
-```
--- a/crates/luminal_python/examples/whisper.py
+++ b/crates/luminal_python/examples/whisper.py
@@ -1,497 +0,0 @@
-"""Whisper transcription demo using the luminal torch.compile backend.
-
-Implements a small PyTorch port of ``openai/whisper-tiny.en`` that mirrors the
-luminal Rust example (``examples/whisper`` in the workspace), loads the official
-HuggingFace weights, and runs greedy decoding through the luminal backend via
-``torch.compile``.
-
-Usage::
-
-    uv run python examples/whisper.py [path/to/audio.wav]
-
-If no path is provided, falls back to the JFK sample bundled with the Rust
-``examples/whisper`` crate.
-"""
-
-from __future__ import annotations
-
-import os
-import sys
-import time
-import wave
-from pathlib import Path
-from typing import Optional
-
-import numpy as np
-import torch
-import torch._dynamo
-import torch.nn.functional as F
-from transformers import (
-    WhisperFeatureExtractor,
-    WhisperForConditionalGeneration,
-    WhisperTokenizer,
-)
-
-from luminal.pt2 import compile as luminal_compile
-
-REPO_ID = "openai/whisper-tiny.en"
-
-# whisper-tiny.en hyperparameters
-N_MELS = 80
-N_AUDIO_CTX = 1500
-D_MODEL = 384
-N_HEADS = 6
-HEAD_DIM = D_MODEL // N_HEADS
-N_AUDIO_LAYER = 4
-N_TEXT_LAYER = 4
-N_TEXT_CTX = 448
-FF_DIM = 4 * D_MODEL
-N_VOCAB = 51864
-LAYER_NORM_EPS = 1e-5
-
-# Decoder special tokens
-TOKEN_SOT = 50257
-TOKEN_NO_TIMESTAMPS = 50362
-TOKEN_EOT = 50256
-
-
-# ---------------------------------------------------------------------------
-# Model — mirrors the HLIR encoder/decoder in examples/whisper/src/model.rs
-# ---------------------------------------------------------------------------
-
-
-class WhisperAttention(torch.nn.Module):
-    """Multi-head attention with separate q/k/v projections (no bias on k_proj)."""
-
-    def __init__(self, d_model: int = D_MODEL, n_heads: int = N_HEADS):
-        super().__init__()
-        self.n_heads = n_heads
-        self.head_dim = d_model // n_heads
-        self.q_proj = torch.nn.Linear(d_model, d_model, bias=True)
-        self.k_proj = torch.nn.Linear(d_model, d_model, bias=False)
-        self.v_proj = torch.nn.Linear(d_model, d_model, bias=True)
-        self.out_proj = torch.nn.Linear(d_model, d_model, bias=True)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        kv_input: Optional[torch.Tensor] = None,
-        causal: bool = False,
-    ) -> torch.Tensor:
-        # x: (seq, d_model). kv_input is None → self-attn; otherwise cross-attn.
-        kv = x if kv_input is None else kv_input
-        q = self.q_proj(x)
-        k = self.k_proj(kv)
-        v = self.v_proj(kv)
-
-        seq_q = q.shape[0]
-        seq_kv = k.shape[0]
-
-        # (seq, d_model) -> (n_heads, seq, head_dim)
-        q = q.reshape(seq_q, self.n_heads, self.head_dim).transpose(0, 1)
-        k = k.reshape(seq_kv, self.n_heads, self.head_dim).transpose(0, 1)
-        v = v.reshape(seq_kv, self.n_heads, self.head_dim).transpose(0, 1)
-
-        scale = 1.0 / (self.head_dim**0.5)
-        scores = torch.matmul(q, k.transpose(-2, -1)) * scale  # (h, sq, sk)
-        if causal:
-            # Use a large finite negative instead of -inf so the export pipeline
-            # serializes a float instead of the unsupported "-Infinity" sentinel.
-            mask = torch.triu(
-                torch.full((seq_q, seq_kv), -1e10, device=x.device),
-                diagonal=1,
-            )
-            scores = scores + mask
-        weights = torch.softmax(scores, dim=-1)
-        attn = torch.matmul(weights, v)  # (h, sq, hd)
-        merged = attn.transpose(0, 1).reshape(seq_q, -1)
-        return self.out_proj(merged)
-
-
-class EncoderLayer(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.self_attn = WhisperAttention()
-        self.self_attn_layer_norm = torch.nn.LayerNorm(D_MODEL, eps=LAYER_NORM_EPS)
-        self.fc1 = torch.nn.Linear(D_MODEL, FF_DIM, bias=True)
-        self.fc2 = torch.nn.Linear(FF_DIM, D_MODEL, bias=True)
-        self.final_layer_norm = torch.nn.LayerNorm(D_MODEL, eps=LAYER_NORM_EPS)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = x + self.self_attn(self.self_attn_layer_norm(x))
-        h = self.final_layer_norm(x)
-        h = F.gelu(self.fc1(h))
-        h = self.fc2(h)
-        return x + h
-
-
-class WhisperEncoder(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.conv1 = torch.nn.Conv1d(
-            N_MELS, D_MODEL, kernel_size=3, padding=1, bias=True
-        )
-        self.conv2 = torch.nn.Conv1d(
-            D_MODEL, D_MODEL, kernel_size=3, stride=2, padding=1, bias=True
-        )
-        # Position embedding stored as a regular parameter (matches HF layout).
-        self.embed_positions = torch.nn.Embedding(N_AUDIO_CTX, D_MODEL)
-        self.layers = torch.nn.ModuleList(
-            [EncoderLayer() for _ in range(N_AUDIO_LAYER)]
-        )
-        self.layer_norm = torch.nn.LayerNorm(D_MODEL, eps=LAYER_NORM_EPS)
-
-    def forward(self, mel: torch.Tensor) -> torch.Tensor:
-        # mel: (n_mels, 3000) -> add batch dim for conv1d
-        x = mel.unsqueeze(0)
-        x = F.gelu(self.conv1(x))
-        x = F.gelu(self.conv2(x))
-        # (1, d_model, 1500) -> (1500, d_model)
-        x = x.squeeze(0).transpose(0, 1)
-        x = x + self.embed_positions.weight
-        for layer in self.layers:
-            x = layer(x)
-        return self.layer_norm(x)
-
-
-class DecoderLayer(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.self_attn = WhisperAttention()
-        self.self_attn_layer_norm = torch.nn.LayerNorm(D_MODEL, eps=LAYER_NORM_EPS)
-        self.encoder_attn = WhisperAttention()
-        self.encoder_attn_layer_norm = torch.nn.LayerNorm(D_MODEL, eps=LAYER_NORM_EPS)
-        self.fc1 = torch.nn.Linear(D_MODEL, FF_DIM, bias=True)
-        self.fc2 = torch.nn.Linear(FF_DIM, D_MODEL, bias=True)
-        self.final_layer_norm = torch.nn.LayerNorm(D_MODEL, eps=LAYER_NORM_EPS)
-
-    def forward(self, x: torch.Tensor, xa: torch.Tensor) -> torch.Tensor:
-        x = x + self.self_attn(self.self_attn_layer_norm(x), causal=True)
-        x = x + self.encoder_attn(self.encoder_attn_layer_norm(x), kv_input=xa)
-        h = self.final_layer_norm(x)
-        h = F.gelu(self.fc1(h))
-        h = self.fc2(h)
-        return x + h
-
-
-class WhisperDecoder(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.embed_tokens = torch.nn.Embedding(N_VOCAB, D_MODEL)
-        self.embed_positions = torch.nn.Embedding(N_TEXT_CTX, D_MODEL)
-        self.layers = torch.nn.ModuleList([DecoderLayer() for _ in range(N_TEXT_LAYER)])
-        self.layer_norm = torch.nn.LayerNorm(D_MODEL, eps=LAYER_NORM_EPS)
-
-    def forward(self, tokens: torch.Tensor, xa: torch.Tensor) -> torch.Tensor:
-        # tokens: (seq,) of int64 — absolute positions are 0..seq-1
-        seq = tokens.shape[0]
-        pos = torch.arange(seq, dtype=torch.long, device=tokens.device)
-        x = self.embed_tokens(tokens) + self.embed_positions(pos)
-        for layer in self.layers:
-            x = layer(x, xa)
-        x = self.layer_norm(x)
-        # Tied projection
-        return torch.matmul(x, self.embed_tokens.weight.transpose(0, 1))
-
-
-class Whisper(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.encoder = WhisperEncoder()
-        self.decoder = WhisperDecoder()
-
-    def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> torch.Tensor:
-        xa = self.encoder(mel)
-        return self.decoder(tokens, xa)
-
-
-class DecoderWithFixedXa(torch.nn.Module):
-    """Wraps the decoder with the encoder output stored as a buffer.
-
-    The audio is fixed for the whole utterance, so ``xa`` is a constant relative
-    to the per-token decode loop. Storing it as a buffer lets us compile the
-    decoder once with a single dynamic-length ``tokens`` input, avoiding a full
-    recompilation at every step as the sequence grows.
-    """
-
-    def __init__(self, decoder: WhisperDecoder, xa: torch.Tensor):
-        super().__init__()
-        self.decoder = decoder
-        self.register_buffer("xa", xa)
-
-    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
-        return self.decoder(tokens, self.xa)
-
-
-# ---------------------------------------------------------------------------
-# Weight loading: HF state_dict -> our model
-# ---------------------------------------------------------------------------
-
-
-def load_hf_weights_into(model: Whisper) -> None:
-    """Copy HF whisper-tiny.en weights into our matching modules."""
-    hf = WhisperForConditionalGeneration.from_pretrained(REPO_ID).eval()
-    sd = hf.state_dict()
-
-    def get(name: str) -> torch.Tensor:
-        return sd[f"model.{name}"].clone()
-
-    enc = model.encoder
-    enc.conv1.weight.data.copy_(get("encoder.conv1.weight"))
-    enc.conv1.bias.data.copy_(get("encoder.conv1.bias"))
-    enc.conv2.weight.data.copy_(get("encoder.conv2.weight"))
-    enc.conv2.bias.data.copy_(get("encoder.conv2.bias"))
-    enc.embed_positions.weight.data.copy_(get("encoder.embed_positions.weight"))
-    enc.layer_norm.weight.data.copy_(get("encoder.layer_norm.weight"))
-    enc.layer_norm.bias.data.copy_(get("encoder.layer_norm.bias"))
-    for i, layer in enumerate(enc.layers):
-        prefix = f"encoder.layers.{i}"
-        layer.self_attn.q_proj.weight.data.copy_(
-            get(f"{prefix}.self_attn.q_proj.weight")
-        )
-        layer.self_attn.q_proj.bias.data.copy_(get(f"{prefix}.self_attn.q_proj.bias"))
-        layer.self_attn.k_proj.weight.data.copy_(
-            get(f"{prefix}.self_attn.k_proj.weight")
-        )
-        layer.self_attn.v_proj.weight.data.copy_(
-            get(f"{prefix}.self_attn.v_proj.weight")
-        )
-        layer.self_attn.v_proj.bias.data.copy_(get(f"{prefix}.self_attn.v_proj.bias"))
-        layer.self_attn.out_proj.weight.data.copy_(
-            get(f"{prefix}.self_attn.out_proj.weight")
-        )
-        layer.self_attn.out_proj.bias.data.copy_(
-            get(f"{prefix}.self_attn.out_proj.bias")
-        )
-        layer.self_attn_layer_norm.weight.data.copy_(
-            get(f"{prefix}.self_attn_layer_norm.weight")
-        )
-        layer.self_attn_layer_norm.bias.data.copy_(
-            get(f"{prefix}.self_attn_layer_norm.bias")
-        )
-        layer.fc1.weight.data.copy_(get(f"{prefix}.fc1.weight"))
-        layer.fc1.bias.data.copy_(get(f"{prefix}.fc1.bias"))
-        layer.fc2.weight.data.copy_(get(f"{prefix}.fc2.weight"))
-        layer.fc2.bias.data.copy_(get(f"{prefix}.fc2.bias"))
-        layer.final_layer_norm.weight.data.copy_(
-            get(f"{prefix}.final_layer_norm.weight")
-        )
-        layer.final_layer_norm.bias.data.copy_(get(f"{prefix}.final_layer_norm.bias"))
-
-    dec = model.decoder
-    dec.embed_tokens.weight.data.copy_(get("decoder.embed_tokens.weight"))
-    dec.embed_positions.weight.data.copy_(get("decoder.embed_positions.weight"))
-    dec.layer_norm.weight.data.copy_(get("decoder.layer_norm.weight"))
-    dec.layer_norm.bias.data.copy_(get("decoder.layer_norm.bias"))
-    for i, layer in enumerate(dec.layers):
-        prefix = f"decoder.layers.{i}"
-        layer.self_attn.q_proj.weight.data.copy_(
-            get(f"{prefix}.self_attn.q_proj.weight")
-        )
-        layer.self_attn.q_proj.bias.data.copy_(get(f"{prefix}.self_attn.q_proj.bias"))
-        layer.self_attn.k_proj.weight.data.copy_(
-            get(f"{prefix}.self_attn.k_proj.weight")
-        )
-        layer.self_attn.v_proj.weight.data.copy_(
-            get(f"{prefix}.self_attn.v_proj.weight")
-        )
-        layer.self_attn.v_proj.bias.data.copy_(get(f"{prefix}.self_attn.v_proj.bias"))
-        layer.self_attn.out_proj.weight.data.copy_(
-            get(f"{prefix}.self_attn.out_proj.weight")
-        )
-        layer.self_attn.out_proj.bias.data.copy_(
-            get(f"{prefix}.self_attn.out_proj.bias")
-        )
-        layer.self_attn_layer_norm.weight.data.copy_(
-            get(f"{prefix}.self_attn_layer_norm.weight")
-        )
-        layer.self_attn_layer_norm.bias.data.copy_(
-            get(f"{prefix}.self_attn_layer_norm.bias")
-        )
-        layer.encoder_attn.q_proj.weight.data.copy_(
-            get(f"{prefix}.encoder_attn.q_proj.weight")
-        )
-        layer.encoder_attn.q_proj.bias.data.copy_(
-            get(f"{prefix}.encoder_attn.q_proj.bias")
-        )
-        layer.encoder_attn.k_proj.weight.data.copy_(
-            get(f"{prefix}.encoder_attn.k_proj.weight")
-        )
-        layer.encoder_attn.v_proj.weight.data.copy_(
-            get(f"{prefix}.encoder_attn.v_proj.weight")
-        )
-        layer.encoder_attn.v_proj.bias.data.copy_(
-            get(f"{prefix}.encoder_attn.v_proj.bias")
-        )
-        layer.encoder_attn.out_proj.weight.data.copy_(
-            get(f"{prefix}.encoder_attn.out_proj.weight")
-        )
-        layer.encoder_attn.out_proj.bias.data.copy_(
-            get(f"{prefix}.encoder_attn.out_proj.bias")
-        )
-        layer.encoder_attn_layer_norm.weight.data.copy_(
-            get(f"{prefix}.encoder_attn_layer_norm.weight")
-        )
-        layer.encoder_attn_layer_norm.bias.data.copy_(
-            get(f"{prefix}.encoder_attn_layer_norm.bias")
-        )
-        layer.fc1.weight.data.copy_(get(f"{prefix}.fc1.weight"))
-        layer.fc1.bias.data.copy_(get(f"{prefix}.fc1.bias"))
-        layer.fc2.weight.data.copy_(get(f"{prefix}.fc2.weight"))
-        layer.fc2.bias.data.copy_(get(f"{prefix}.fc2.bias"))
-        layer.final_layer_norm.weight.data.copy_(
-            get(f"{prefix}.final_layer_norm.weight")
-        )
-        layer.final_layer_norm.bias.data.copy_(get(f"{prefix}.final_layer_norm.bias"))
-
-
-# ---------------------------------------------------------------------------
-# Audio loading + decoding
-# ---------------------------------------------------------------------------
-
-
-def load_wav_16k_mono(path: Path) -> np.ndarray:
-    with wave.open(str(path), "rb") as w:
-        sr = w.getframerate()
-        n = w.getnframes()
-        ch = w.getnchannels()
-        sw = w.getsampwidth()
-        raw = w.readframes(n)
-
-    if sw == 2:
-        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0
-    elif sw == 4:
-        samples = np.frombuffer(raw, dtype=np.int32).astype(np.float32) / 2147483648.0
-    elif sw == 1:
-        samples = (
-            np.frombuffer(raw, dtype=np.uint8).astype(np.float32) - 128.0
-        ) / 128.0
-    else:
-        raise ValueError(f"unsupported sample width {sw}")
-
-    if ch > 1:
-        samples = samples.reshape(-1, ch).mean(axis=1)
-
-    if sr != 16000:
-        ratio = sr / 16000
-        out_len = int(len(samples) / ratio)
-        idx = np.arange(out_len, dtype=np.float64) * ratio
-        lo = idx.astype(np.int64)
-        frac = (idx - lo).astype(np.float32)
-        hi = np.clip(lo + 1, 0, len(samples) - 1)
-        samples = samples[lo] * (1.0 - frac) + samples[hi] * frac
-
-    return samples.astype(np.float32)
-
-
-def greedy_decode(logits_row: torch.Tensor, suppress_first_eot: bool) -> int:
-    masked = logits_row.clone()
-    masked[TOKEN_SOT:] = float("-inf")
-    if suppress_first_eot:
-        masked[TOKEN_EOT] = float("-inf")
-    return int(torch.argmax(masked).item())
-
-
-def find_default_audio() -> Optional[Path]:
-    here = Path(__file__).resolve()
-    workspace_root = here.parents[3]
-    candidate = workspace_root / "examples" / "whisper" / "assets" / "jfk.wav"
-    return candidate if candidate.exists() else None
-
-
-def main() -> None:
-    audio_arg = sys.argv[1] if len(sys.argv) > 1 else None
-    if audio_arg:
-        audio_path = Path(audio_arg)
-    else:
-        audio_path = find_default_audio()
-        if audio_path is None:
-            print(
-                "error: no audio file given and bundled jfk.wav not found",
-                file=sys.stderr,
-            )
-            sys.exit(1)
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    print(f"Using device: {device}")
-
-    print("Loading audio:", audio_path)
-    audio = load_wav_16k_mono(audio_path)
-
-    print("Computing log-mel features...")
-    feature_extractor = WhisperFeatureExtractor.from_pretrained(REPO_ID)
-    features = feature_extractor(audio, sampling_rate=16000, return_tensors="pt")
-    mel: torch.Tensor = features.input_features[0].to(device)  # (80, 3000)
-    assert mel.shape == (N_MELS, 3000), mel.shape
-
-    print("Building model and loading weights...")
-    model = Whisper().eval().to(device)
-    load_hf_weights_into(model)
-    model = model.to(device)
-    tokenizer = WhisperTokenizer.from_pretrained(REPO_ID)
-
-    use_compiled = os.environ.get("LUMINAL_DISABLE", "0") != "1"
-    max_new_tokens = 100
-    search_iters = int(os.environ.get("SEARCH_ITERATIONS", "10"))
-
-    if use_compiled:
-        # 1. Run the encoder once eagerly. The audio doesn't change during decode,
-        #    so xa is a constant input to the decoder.
-        with torch.no_grad():
-            xa = model.encoder(mel)
-
-        # 2. Wrap the decoder so its only varying input is `tokens`, then compile
-        #    once with a dynamic length dim. Subsequent calls reuse the same
-        #    compiled graph — no recompile per token.
-        decoder_only = DecoderWithFixedXa(model.decoder, xa).eval().to(device)
-        example_tokens = torch.tensor(
-            [TOKEN_SOT, TOKEN_NO_TIMESTAMPS], dtype=torch.long, device=device
-        )
-        print(
-            f"Compiling decoder with dynamic seq dim (search_iters={search_iters})..."
-        )
-        compile_start = time.time()
-        compiled_decoder = luminal_compile(
-            decoder_only,
-            example_tokens,
-            search_iterations=search_iters,
-            dynamic_dim=0,
-        )
-        print(f"Compiled in {time.time() - compile_start:.1f}s")
-
-        def step_logits(decoder_input_ids: torch.Tensor) -> torch.Tensor:
-            out = compiled_decoder(decoder_input_ids)
-            return out[0] if isinstance(out, tuple) else out
-    else:
-
-        def step_logits(decoder_input_ids: torch.Tensor) -> torch.Tensor:
-            return model(mel, decoder_input_ids)
-
-    tokens = [TOKEN_SOT, TOKEN_NO_TIMESTAMPS]
-
-    print("Transcribing", end="", flush=True)
-    decode_start = time.time()
-    for step in range(max_new_tokens):
-        decoder_input_ids = torch.tensor(tokens, dtype=torch.long, device=device)
-        with torch.no_grad():
-            logits = step_logits(decoder_input_ids)
-
-        next_token = greedy_decode(logits[-1], suppress_first_eot=(step == 0))
-        if next_token == TOKEN_EOT:
-            break
-        tokens.append(next_token)
-        piece = tokenizer.decode([next_token], skip_special_tokens=False)
-        print(piece, end="", flush=True)
-    elapsed = time.time() - decode_start
-    print()
-
-    transcription = tokenizer.decode(tokens[2:], skip_special_tokens=True)
-    print(f"\nFinal transcription: {transcription}")
-    print(
-        f"Generated {len(tokens) - 2} tokens in {elapsed:.2f}s "
-        f"({(len(tokens) - 2) / max(elapsed, 1e-6):.1f} tok/s)"
-    )
-
-
-if __name__ == "__main__":
-    main()
--- a/crates/luminal_python/modal_pytest_runner.py
+++ b/crates/luminal_python/modal_pytest_runner.py
@@ -22,7 +22,7 @@ from modal.volume import FileEntryType

 app = modal.App("luminal-tests")

-DEFAULT_TIMEOUT = 2 * 60 * 60
+DEFAULT_TIMEOUT = 30 * 60
 CUDARC_CUDA_VERSION = "12080"
 LOCAL_PROJECT_DIR = Path(__file__).resolve().parent
 PROJECT_DIR = "/root/luminal/crates/luminal_python"
@@ -168,37 +168,6 @@ def _cleanup_remote_profile_artifacts(run_id: str) -> None:
        return


-def _build_cuda_extension(env: dict[str, str]) -> None:
-    cmd = [
-        "uv",
-        "run",
-        "--project",
-        PROJECT_DIR,
-        "--group",
-        "dev",
-        "maturin",
-        "develop",
-        "--manifest-path",
-        f"{PROJECT_DIR}/rust/Cargo.toml",
-        "--features",
-        "cuda",
-        "--profile",
-        "release",
-    ]
-    subprocess.run(cmd, env=env, cwd=PROJECT_DIR, check=True)
-
-
-def _effective_timeout(timeout: int) -> int:
-    if os.environ.get("GITHUB_ACTIONS") == "true" and timeout < DEFAULT_TIMEOUT:
-        print(
-            f"Using Modal timeout {DEFAULT_TIMEOUT}s instead of requested "
-            f"{timeout}s in GitHub Actions.",
-            file=sys.stderr,
-        )
-        return DEFAULT_TIMEOUT
-    return timeout
-
-
@app.cls(image=image, timeout=DEFAULT_TIMEOUT)
 class TestRunner:
    @modal.method()
@@ -225,8 +194,6 @@ class TestRunner:
        if pytest_addopts:
            env["PYTEST_ADDOPTS"] = pytest_addopts

-        _build_cuda_extension(env)
-
        original_svg_requested = _has_pytest_flag(pytest_args, "--profile-svg")
        dot_available = shutil.which("dot") is not None
        sanitized_pytest_args = [
@@ -251,6 +218,8 @@ class TestRunner:
            PROJECT_DIR,
            "--group",
            "dev",
+            "--reinstall-package",
+            "luminal_python",
            "python",
            "-m",
            "pytest",
@@ -316,7 +285,7 @@ class TestRunner:

 def _parse_cli_args(
    cli_args: tuple[str, ...],
-) -> tuple[str, int, bool, str | None, list[str]]:
+) -> tuple[str, int | None, bool, str | None, list[str]]:
    parser = argparse.ArgumentParser(
        prog="modal run modal_pytest_runner.py",
        add_help=False,
@@ -331,8 +300,7 @@ def _parse_cli_args(
    parser.add_argument(
        "--timeout",
        type=int,
-        default=DEFAULT_TIMEOUT,
-        help="Modal execution timeout in seconds. Defaults to %(default)s seconds.",
+        help="Optional Modal execution timeout in seconds. Defaults to 1800 seconds.",
    )
    parser.add_argument(
        "--profile",
@@ -366,11 +334,11 @@ def main(*cli_args: str):
    )
    profile_enabled = _profiling_enabled(cli_profile, pytest_args)
    pytest_addopts = os.environ.get("PYTEST_ADDOPTS", "")
-    timeout = _effective_timeout(timeout)
    runner_options = {"gpu": gpu}
    hf_token_secret = _hf_token_secret()
    runner_volumes = {HF_CACHE_PATH: HF_CACHE_VOLUME}
-    runner_options["timeout"] = timeout
+    if timeout is not None:
+        runner_options["timeout"] = timeout
    if profile_enabled:
        runner_volumes[PROFILE_VOLUME_PATH] = PROFILE_VOLUME
    runner_options["volumes"] = runner_volumes
--- a/crates/luminal_python/pyproject.toml
+++ b/crates/luminal_python/pyproject.toml
@@ -32,7 +32,7 @@ module-name = "luminal.luminal"

 [tool.pytest.ini_options]
 markers = [
-    "slow: tests that download large models, compile full-width model graphs, fuzz many CUDA search choices, or otherwise require explicit opt-in",
+    "slow: tests that download large models or require pre-generated artifacts",
 ]

 [dependency-groups]
@@ -46,4 +46,5 @@ dev = [
    "transformers>=4.40.0",
    "diffusers>=0.35.0",
    "modal>=1.3.5",
+    "matplotlib>=3.8",
 ]
--- a/crates/luminal_python/run_all_tests.sh
+++ b/crates/luminal_python/run_all_tests.sh
@@ -1,43 +1,34 @@
 #!/bin/bash
 set -e

-export CUDARC_CUDA_VERSION="${CUDARC_CUDA_VERSION:-12080}"
-export MATURIN_PEP517_ARGS="${MATURIN_PEP517_ARGS:---features cuda --profile release}"
-
 echo "=========================================="
 echo "  Luminal Python: Full Test Suite"
 echo "=========================================="

 NATIVE_TESTS="tests/test_hlir_ops.py tests/test_unary.py"
-CUDA_TESTS="tests/"
+CUDA_TESTS="tests/test_hlir_ops.py tests/test_unary.py tests/test_llama3.py"

 # ── Phase 1: Native Backend ─────────────────────────────────

 echo ""
 echo "=== Phase 1: Building native backend ==="
 rm -rf rust/target/wheels rust/target/debug rust/target/release
-uv run --group dev maturin develop --manifest-path rust/Cargo.toml
+uv run maturin develop --manifest-path rust/Cargo.toml

 echo ""
 echo "--- 1a: Native backend tests ---"
-uv run --group dev pytest $NATIVE_TESTS -v
+uv run pytest $NATIVE_TESTS -v

 # ── Phase 2: CUDA Backend ───────────────────────────────────

 echo ""
 echo "=== Phase 2: Building CUDA backend ==="
 rm -rf rust/target/wheels rust/target/debug rust/target/release
-uv run --group dev maturin develop --manifest-path rust/Cargo.toml --features cuda -r
+uv run maturin develop --manifest-path rust/Cargo.toml --features cuda -r

 echo ""
 echo "--- 2a: CUDA ---"
-RUST_BACKTRACE=1 LUMINAL_TEST_DEVICE=cuda uv run --group dev pytest $CUDA_TESTS -m "not slow" -v
-
-echo ""
-echo "Slow CUDA tests are opt-in. To include them, run:"
-echo "  RUST_BACKTRACE=1 LUMINAL_TEST_DEVICE=cuda uv run pytest tests/ -v -s"
-echo "Or, for only slow tests:"
-echo "  RUST_BACKTRACE=1 LUMINAL_TEST_DEVICE=cuda uv run pytest tests/ -m slow -v -s"
+RUST_BACKTRACE=1 LUMINAL_TEST_DEVICE=cuda uv run pytest $CUDA_TESTS -m "not slow" -v

 echo ""
 echo "=========================================="
--- a/crates/luminal_python/run_tests_cuda.sh
+++ b/crates/luminal_python/run_tests_cuda.sh
@@ -4,34 +4,17 @@ set -e
 echo "=== Luminal Python Test Runner (CUDA Backend) ==="
 echo ""

-export CUDARC_CUDA_VERSION="${CUDARC_CUDA_VERSION:-12080}"
-export MATURIN_PEP517_ARGS="${MATURIN_PEP517_ARGS:---features cuda --profile release}"
-
-PYTEST_MARK='not slow'
-if [[ "${1:-}" == "--include-slow" ]]; then
-    PYTEST_MARK=''
-elif [[ "${1:-}" == "--slow-only" ]]; then
-    PYTEST_MARK='slow'
-elif [[ "${1:-}" != "" ]]; then
-    echo "Usage: ./run_tests_cuda.sh [--include-slow|--slow-only]"
-    exit 2
-fi
-
 # Force clean rebuild of Rust extension
 echo "Step 1: Cleaning previous builds..."
 rm -rf rust/target/wheels rust/target/debug rust/target/release

 # Rebuild in development mode (faster compilation)
 echo "Step 2: Building Rust extension..."
-uv run --group dev maturin develop --manifest-path rust/Cargo.toml --features cuda -r
+uv run maturin develop --manifest-path rust/Cargo.toml --features cuda -r

 # Run pytest with CUDA backend
 echo "Step 3: Running pytest with CUDA backend..."
-if [[ -n "$PYTEST_MARK" ]]; then
-    RUST_BACKTRACE=1 LUMINAL_TEST_DEVICE=cuda uv run --group dev pytest tests/ -m "$PYTEST_MARK" -v -s
-else
-    RUST_BACKTRACE=1 LUMINAL_TEST_DEVICE=cuda uv run --group dev pytest tests/ -v -s
-fi
+RUST_BACKTRACE=1 LUMINAL_TEST_DEVICE=cuda uv run pytest tests/test_llama3.py tests/test_hlir_ops.py tests/test_unary.py -v

 echo ""
 echo "=== Tests Complete ==="
--- a/crates/luminal_python/rust/src/compiled_graph.rs
+++ b/crates/luminal_python/rust/src/compiled_graph.rs
@@ -12,67 +12,6 @@ use crate::typed_data::TypedData;
 /// Maps symbolic dimension parameter names (e.g. "seq_len") to luminal Expression variable chars.
 pub type DimParamMap = HashMap<String, char>;

-/// Recover a single-variable dim's variable value from an observed runtime size.
-///
-/// Returns `Some((var, value))` when the expression contains exactly one
-/// variable, is affine in that variable, and `value` round-trips through
-/// `exec_single_var_checked` to reproduce `dim_val`. Returns `None` otherwise
-/// — multi-variable expressions, non-affine forms, slope==0, and inversions
-/// that don't divide cleanly are all rejected so we never write a wrong
-/// guess into `dyn_map`.
-fn solve_single_var_dim(expr: &Expression, dim_val: usize) -> Option<(char, usize)> {
-    use luminal::shape::Term;
-    let terms = expr.terms.read();
-
-    // Identify the unique variable, if any.
-    let mut var: Option<char> = None;
-    for t in terms.iter() {
-        if let Term::Var(c) = t {
-            match var {
-                None => var = Some(*c),
-                Some(existing) if existing == *c => {}
-                Some(_) => return None, // multi-var — bail out
-            }
-        }
-    }
-    let var = var?;
-
-    // Bare-var fast path — terms is exactly `[Var]`.
-    if terms.len() == 1 {
-        return Some((var, dim_val));
-    }
-
-    // Probe two points to recover slope/intercept of an assumed affine form
-    // `f(x) = slope*x + intercept`. We use 2 and 3 (luminal's default
-    // dynamic-dim min is 2, and 3 keeps the inputs small in case the
-    // expression includes a multiplication that could overflow at scale).
-    drop(terms);
-    let f2 = expr.exec_single_var_checked(2)? as i64;
-    let f3 = expr.exec_single_var_checked(3)? as i64;
-    let slope = f3 - f2;
-    if slope == 0 {
-        return None;
-    }
-    let intercept = f2 - 2 * slope;
-    let target = dim_val as i64 - intercept;
-    if slope == 0 || target % slope != 0 {
-        return None;
-    }
-    let candidate = target / slope;
-    if candidate < 0 {
-        return None;
-    }
-    let candidate = candidate as usize;
-
-    // Verify by re-evaluating with the candidate value. Catches non-affine
-    // forms whose probe points happen to be collinear (e.g. `min(s, 100)`
-    // would look affine for s ∈ {2, 3} but flatten beyond 100).
-    if expr.exec_single_var_checked(candidate)? != dim_val {
-        return None;
-    }
-    Some((var, candidate))
-}
-
 /// Convert luminal DType to PT2 dtype integer code (for python interop)
 /// Types without a direct Pytorch equivalent map to the closest safe representation
 fn luminal_dtype_to_pt2_code(dtype: DType) -> u32 {
@@ -98,12 +37,7 @@ pub struct GraphTranslation {
    pub input_names: Vec<String>,
    pub output_names: Vec<String>,
    pub output_shape_exprs: Vec<Vec<Expression>>,
-    /// Output dtypes as PT2 dtype codes (e.g. 5 = int64, 7 = float32).
-    /// Stored as PT2 codes (rather than luminal `DType`) so we can preserve
-    /// distinctions luminal collapses internally — notably int64 vs int32,
-    /// both of which map to `DType::Int` in luminal but must be reported
-    /// back to PyTorch with their original precision.
-    pub output_dtypes: Vec<u32>,
+    pub output_dtypes: Vec<DType>,
    pub input_shape_exprs: Vec<Vec<Expression>>,
    pub dim_param_map: DimParamMap,
 }
@@ -129,9 +63,7 @@ pub struct CompiledGraph {
    pub output_names: Vec<String>,
    pub output_shapes: Vec<Vec<usize>>,
    pub output_shape_exprs: Vec<Vec<Expression>>,
-    /// Output dtypes as PT2 dtype codes (preserves int64 / int32 distinction
-    /// that luminal collapses to `DType::Int` internally).
-    pub output_dtypes: Vec<u32>,
+    pub output_dtypes: Vec<DType>,
    pub input_shape_exprs: Vec<Vec<Expression>>,
    pub dim_param_map: DimParamMap,
 }
@@ -158,21 +90,17 @@ impl CompiledGraph {
            input_shape_exprs,
            dim_param_map,
        } = translation;
-        let WeightData {
-            weights,
-            tensor_sizes,
-            device_ptrs,
-        } = weight_data;

-        // Build compile args from WeightData.
+        // Build compile args from WeightData (convert TypedData -> raw bytes + dtype)
        let compile_args = BackendCompileArgs {
            search_iters,
-            weights: weights
+            weights: weight_data
+                .weights
                .iter()
                .map(|(label, td)| (label.clone(), td.bytes.clone(), td.dtype))
                .collect(),
-            tensor_sizes,
-            device_ptrs,
+            tensor_sizes: weight_data.tensor_sizes,
+            device_ptrs: weight_data.device_ptrs,
        };

        // Create backend via the factory directly
@@ -291,27 +219,17 @@ impl CompiledGraph {
    }

    /// Auto-detect and set dynamic dimensions from input tensor shapes.
-    ///
-    /// For each user input we walk the symbolic shape expressions side-by-side
-    /// with the concrete sizes Dynamo handed us at runtime and try to recover
-    /// each unbound variable's value. Two cases are handled:
-    ///
-    ///   * Bare-variable dim (`s`): set directly from the size.
-    ///   * Single-variable affine dim (`a*s + b`): solve `s = (size - b)/a`
-    ///     by sampling the expression at two probe points to extract the
-    ///     slope, recovering the intercept, and verifying that plugging the
-    ///     recovered value back through `exec_single_var_checked` reproduces
-    ///     the observed size. The verification step rejects everything
-    ///     non-affine (`s*s`, `min(s, 8)`, etc.) without committing a wrong
-    ///     guess to `dyn_map`.
-    ///
-    /// Multi-variable dims are skipped here; another input's shape — or an
-    /// explicit `set_dim` call — is expected to bind those.
+    /// For each user input, matches the concrete shape against its symbolic
+    /// shape expressions and sets the corresponding dyn_map entries.
    fn auto_set_dims_from_input_shapes(&mut self, input_shapes: Vec<Vec<usize>>) {
        for (shape_exprs, shape) in self.input_shape_exprs.iter().zip(input_shapes.iter()) {
            for (dim_expr, &dim_val) in shape_exprs.iter().zip(shape.iter()) {
-                if let Some((var, value)) = solve_single_var_dim(dim_expr, dim_val) {
-                    self.graph.set_dim(var, value);
+                // Check if this expression is a bare symbolic variable
+                let terms = dim_expr.terms.read();
+                if terms.len() == 1
+                    && let luminal::shape::Term::Var(c) = terms[0]
+                {
+                    self.graph.set_dim(c, dim_val);
                }
            }
        }
@@ -391,7 +309,7 @@ impl CompiledGraph {
        Ok(())
    }

-    /// Register a weight from a device pointer (e.g. "fc1.weight"). Zero-copy on device.
+    /// Set a weight from a device pointer (e.g. "fc1.weight"). Zero-copy on device.
    /// Requires a GPU backend.
    fn set_weight_device_ptr(
        &mut self,
@@ -452,7 +370,7 @@ impl CompiledGraph {
        Ok(self.runtime.output_is_zero_copy(*node_id))
    }

-    /// Register a weight tensor from a CPU host pointer, matching by Input node label (dtype-aware).
+    /// Set a weight tensor from a CPU host pointer, matching by Input node label (dtype-aware).
    /// `n_bytes` is the total byte count. `dtype_code` uses PT2 numbering (7=f32, 6=f16, 13=bf16, etc.).
    fn set_weight_from_ptr(
        &mut self,
@@ -487,7 +405,10 @@ impl CompiledGraph {
    /// Get the PT2 dtype codes for all outputs (in order).
    #[getter]
    fn output_dtypes(&self) -> Vec<u32> {
-        self.output_dtypes.clone()
+        self.output_dtypes
+            .iter()
+            .map(|d| luminal_dtype_to_pt2_code(*d))
+            .collect()
    }

    /// Get output tensor data by name as f32 (copies to host).
--- a/crates/luminal_python/rust/src/lib.rs
+++ b/crates/luminal_python/rust/src/lib.rs
@@ -3,7 +3,6 @@ pub mod typed_data;

 // PT2 modules
 mod pt2_compiled_model;
-mod pt2_expr;
 mod pt2_parser;
 mod pt2_schema;
 mod pt2_util;
--- a/Show More
+++ b/Show More