Fix CUDA crash in fuzz_genomes after loop rolling prepass

The auto-roll prepass inserts LoopStart/LoopEnd/LoopInput/LoopOutput marker ops into the HLIR. These markers survive through egglog rewriting into LLIR and must be collapsed by `unroll_loops_in_llir` before runtime execution — the markers are a search-time scaffold, not executable ops. `Graph::search` did this correctly on its chosen best genome, but `fuzz_genomes` (a test utility that exercises alternative extracted genomes) called `egglog_to_llir` directly without the unroll. The CUDA runtime then tried to execute genomes containing raw loop markers, hitting CUDA_ERROR_ILLEGAL_ADDRESS. The crash cascaded across ~20 downstream tests via shared CUDA context state. Also lower the rolling occurrence threshold from 3 back to 2 — the 3-occurrence floor that previously masked this bug was a band-aid; the real fix is the missing unroll call in the test utility. Finally, compute op identity in `cheap_rolling_node_hash` and `canonicalize_occurrence` from the op's Debug output instead of its Display output: Display drops shape/stride metadata for many HLIR ops (e.g. `Mul` prints as just "Mul"), so structurally different ops of the same kind could hash equal and be falsely grouped as a repeating pattern. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-01 21:49:47 +09:00 · 2026-04-26 03:45:31 +00:00
parent 13c870de86
commit 7d68b62aa8
2 changed files with 18 additions and 9 deletions
--- a/crates/luminal_cuda_lite/src/tests/utilities.rs
+++ b/crates/luminal_cuda_lite/src/tests/utilities.rs
@@ -468,7 +468,7 @@ pub fn fuzz_genomes<T: TestDType>(

            let mut list_cache = FxHashMap::default();
            let mut expr_cache = FxHashMap::default();
-            let llir_graph = egglog_to_llir(
+            let mut llir_graph = egglog_to_llir(
                egraph,
                genome.clone(),
                ops,
@@ -477,6 +477,12 @@ pub fn fuzz_genomes<T: TestDType>(
                &mut expr_cache,
                None,
            );
+            // Same finalization as `Graph::search` performs on the chosen
+            // best LLIR: collapse the rolled body's loop markers into a
+            // fully-unrolled LLIR. The runtime cannot execute LoopStart /
+            // LoopEnd / LoopInput / LoopOutput markers — they exist only as
+            // a search-time scaffold the auto-roll prepass introduces.
+            unroll_loops_in_llir(&mut llir_graph);

            let mut rt = CudaRuntime::initialize(stream.clone());
            rt.load_llir(&llir_graph);
--- a/src/graph.rs
+++ b/src/graph.rs
@@ -633,12 +633,7 @@ impl Graph {
        for run in report.diagnostics.top_runs.iter().take(5) {
            println!("   {:>6}  run: {}", "Rolled".yellow().bold(), run);
        }
-        // Rolling has rough edges on graphs with fewer than 3 repetitions —
-        // proptest-generated test cases hit body×2 patterns that round-trip
-        // incorrectly through egglog + unroll. Real models roll 20–50
-        // repetitions of a transformer block, so this threshold doesn't
-        // affect any production path.
-        if candidate.occurrences.len() < 3 {
+        if candidate.occurrences.len() < 2 {
            return 0;
        }

@@ -1444,7 +1439,12 @@ impl RollingHash64 {
 }

 fn cheap_rolling_node_hash(graph: &HLIRGraph, node: NodeIndex) -> u64 {
-    let op = graph[node].to_string();
+    // Use Debug, NOT Display — Display for many HLIR ops drops their
+    // shape/stride metadata (e.g. `Display for Mul` emits just "Mul"), so
+    // two structurally-different ops with the same kind would hash equal
+    // and get falsely grouped as a repeating pattern. Debug captures all
+    // op fields, which is the correct notion of op identity for rolling.
+    let op = format!("{:?}", graph[node]);
    let mut hash: u64 = 1469598103934665603;
    for byte in op.as_bytes() {
        hash ^= u64::from(*byte);
@@ -1485,7 +1485,10 @@ fn canonicalize_occurrence(
    let mut node_parts = vec![];

    for &node in ordered_nodes {
-        let op = graph[node].to_string();
+        // Debug, not Display — see `cheap_rolling_node_hash` for why op
+        // identity must include all fields (shape/strides), which Display
+        // drops for many HLIR ops.
+        let op = format!("{:?}", graph[node]);
        let inputs: Vec<NodeIndex> = graph
            .edges_directed(node, Direction::Incoming)
            .sorted_by_key(|e| e.id())