mirror of
https://github.com/jafioti/luminal.git
synced 2026-06-01 21:49:47 +09:00
Fix CUDA crash in fuzz_genomes after loop rolling prepass
The auto-roll prepass inserts LoopStart/LoopEnd/LoopInput/LoopOutput marker ops into the HLIR. These markers survive through egglog rewriting into LLIR and must be collapsed by `unroll_loops_in_llir` before runtime execution — the markers are a search-time scaffold, not executable ops. `Graph::search` did this correctly on its chosen best genome, but `fuzz_genomes` (a test utility that exercises alternative extracted genomes) called `egglog_to_llir` directly without the unroll step. The CUDA runtime then tried to execute genomes containing raw loop markers, hitting CUDA_ERROR_ILLEGAL_ADDRESS. The crash cascaded across ~20 downstream tests via shared CUDA context state. Also lower the rolling occurrence threshold from 3 back to 2 — the 3-occurrence floor that previously masked this bug was a band-aid; the real fix is adding the missing unroll call to the test utility. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -468,7 +468,7 @@ pub fn fuzz_genomes<T: TestDType>(
|
||||
|
||||
let mut list_cache = FxHashMap::default();
|
||||
let mut expr_cache = FxHashMap::default();
|
||||
let llir_graph = egglog_to_llir(
|
||||
let mut llir_graph = egglog_to_llir(
|
||||
egraph,
|
||||
genome.clone(),
|
||||
ops,
|
||||
@@ -477,6 +477,12 @@ pub fn fuzz_genomes<T: TestDType>(
|
||||
&mut expr_cache,
|
||||
None,
|
||||
);
|
||||
// Same finalization as `Graph::search` performs on the chosen
|
||||
// best LLIR: collapse the rolled body's loop markers into a
|
||||
// fully-unrolled LLIR. The runtime cannot execute LoopStart /
|
||||
// LoopEnd / LoopInput / LoopOutput markers — they exist only as
|
||||
// a search-time scaffold the auto-roll prepass introduces.
|
||||
unroll_loops_in_llir(&mut llir_graph);
|
||||
|
||||
let mut rt = CudaRuntime::initialize(stream.clone());
|
||||
rt.load_llir(&llir_graph);
|
||||
|
||||
19
src/graph.rs
19
src/graph.rs
@@ -633,12 +633,7 @@ impl Graph {
|
||||
for run in report.diagnostics.top_runs.iter().take(5) {
|
||||
println!(" {:>6} run: {}", "Rolled".yellow().bold(), run);
|
||||
}
|
||||
// Rolling has rough edges on graphs with fewer than 3 repetitions —
|
||||
// proptest-generated test cases hit body×2 patterns that round-trip
|
||||
// incorrectly through egglog + unroll. Real models roll 20–50
|
||||
// repetitions of a transformer block, so this threshold doesn't
|
||||
// affect any production path.
|
||||
if candidate.occurrences.len() < 3 {
|
||||
if candidate.occurrences.len() < 2 {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1444,7 +1439,12 @@ impl RollingHash64 {
|
||||
}
|
||||
|
||||
fn cheap_rolling_node_hash(graph: &HLIRGraph, node: NodeIndex) -> u64 {
|
||||
let op = graph[node].to_string();
|
||||
// Use Debug, NOT Display — Display for many HLIR ops drops their
|
||||
// shape/stride metadata (e.g. `Display for Mul` emits just "Mul"), so
|
||||
// two structurally-different ops with the same kind would hash equal
|
||||
// and get falsely grouped as a repeating pattern. Debug captures all
|
||||
// op fields, which is the correct notion of op identity for rolling.
|
||||
let op = format!("{:?}", graph[node]);
|
||||
let mut hash: u64 = 1469598103934665603;
|
||||
for byte in op.as_bytes() {
|
||||
hash ^= u64::from(*byte);
|
||||
@@ -1485,7 +1485,10 @@ fn canonicalize_occurrence(
|
||||
let mut node_parts = vec![];
|
||||
|
||||
for &node in ordered_nodes {
|
||||
let op = graph[node].to_string();
|
||||
// Debug, not Display — see `cheap_rolling_node_hash` for why op
|
||||
// identity must include all fields (shape/strides), which Display
|
||||
// drops for many HLIR ops.
|
||||
let op = format!("{:?}", graph[node]);
|
||||
let inputs: Vec<NodeIndex> = graph
|
||||
.edges_directed(node, Direction::Incoming)
|
||||
.sorted_by_key(|e| e.id())
|
||||
|
||||
Reference in New Issue
Block a user