cuda run successful

2026-06-01 21:49:47 +09:00 · 2025-09-02 22:51:56 +00:00
parent d2339084ca
commit 3c99f23a8b
6 changed files with 148 additions and 91 deletions
--- a/crates/luminal_2/src/codegen.rs
+++ b/crates/luminal_2/src/codegen.rs
@@ -902,6 +902,11 @@ fn make_kernel(
                c_inner_stride,
                k_outer_loops,
            } => {
+                if cfg!(feature = "cuda") {
+                    // CUDA build: skip / fallback
+                    return None; // or generate a non-TC matmul
+                }
+
                let mut srcs = kernel_graph
                    .edges_directed(node, Direction::Incoming)
                    .sorted_by_key(|e| e.id())
--- a/crates/luminal_2/src/extract.rs
+++ b/crates/luminal_2/src/extract.rs
@@ -48,6 +48,24 @@ const INVALID_IR: &[&str] = &[
    "TiledMatmulAcc",
 ];

+#[cfg(feature = "metal")]
+#[inline]
+fn with_autoreleasepool<F, R>(f: F) -> R
+where
+    F: FnOnce() -> R,
+{
+    objc2::rc::autoreleasepool(|_| f())
+}
+
+#[cfg(feature = "cuda")]
+#[inline]
+fn with_autoreleasepool<F, R>(f: F) -> R
+where
+    F: FnOnce() -> R,
+{
+    f()
+}
+
 type Cost = u128; // Execution time in microseconds

 fn is_expression_enode(enode_label: &str) -> bool {
@@ -729,60 +747,6 @@ pub fn extraction_to_graph(
    g
 }

-#[cfg(feature = "cuda")]
-fn cost<'a>(
-    kernels: &StableGraph<Kernel, (usize, usize), Directed>,
-    inputs: &[(NodeIndex, InitData)],
-    gmem_mapping: &HashMap<NodeIndex, usize>,
-    dyn_vars: &FxHashMap<char, usize>,
-) -> Option<(Cost, Vec<Vec<f32>>)> {
-    let (int_buffers, int_buffer_map) = assign_buffers(&kernels);
-    let compiled_kernels = compile_kernels(&kernels);
-    let ctx = CudaContext::new(0).unwrap(); // will need to expand beyond single host
-    // allocation
-    let mut inputs = inputs
-        .into_iter()
-        .map(|(n, b)| {
-            (
-                gmem_mapping[n],
-                (
-                    copy_cuda_buffer(&b.clone().to_vec(dyn_vars), ctx.clone()),
-                    false,
-                ),
-            )
-        })
-        .collect::<FxHashMap<_, _>>();
-    for _ in 0..WARMUP_TRIALS {
-        run_graph(
-            &mut inputs,
-            &kernels,
-            dyn_vars,
-            &compiled_kernels,
-            &int_buffers,
-            &int_buffer_map,
-        );
-    }
-    let mut micros = vec![];
-    let mut outputs = vec![];
-    let mut m;
-    for _ in 0..TRIALS {
-        (outputs, m) = run_graph(
-            &mut inputs,
-            &kernels,
-            dyn_vars,
-            &compiled_kernels,
-            &int_buffers,
-            &int_buffer_map,
-        );
-        micros.push(m);
-    }
-    Some((
-        micros.into_iter().sum::<u128>() / TRIALS as u128,
-        outputs.iter().map(copy_cuda_buffer_back).collect_vec(),
-    ))
-}
-
-#[cfg(feature = "metal")]
 fn cost<'a>(
    graph: &StableGraph<GraphTerm, ()>,
    kernels: &StableGraph<Kernel, (usize, usize), Directed>,
@@ -790,11 +754,14 @@ fn cost<'a>(
    gmem_mapping: &HashMap<NodeIndex, usize>,
    dyn_vars: &FxHashMap<char, usize>,
 ) -> Option<(Cost, Vec<Vec<f32>>)> {
-    autoreleasepool(|_| {
+    with_autoreleasepool(|| {
        // Get buffer info
        let (int_buffers, int_buffer_map) = assign_buffers(&kernels);
        let compiled_kernels = compile_kernels(&kernels);
+        #[cfg(feature = "metal")]
        let device = MTLCreateSystemDefaultDevice().unwrap();
+        #[cfg(feature = "cuda")]
+        let ctx = CudaContext::new(0).unwrap(); // will need to expand beyond single host
        // Copy input buffers over
        let mut inputs = inputs
            .into_iter()
@@ -802,7 +769,10 @@ fn cost<'a>(
                (
                    gmem_mapping[n],
                    (
+                        #[cfg(feature = "metal")]
                        copy_metal_buffer(&b.clone().to_vec(dyn_vars), &device),
+                        #[cfg(feature = "cuda")]
+                        copy_cuda_buffer(&b.clone().to_vec(dyn_vars), ctx.clone()),
                        false,
                    ),
                )
@@ -810,6 +780,7 @@ fn cost<'a>(
            .collect::<FxHashMap<_, _>>();
        // Warm up resources (buffer allocation, kernel compiler, etc.)
        for _ in 0..WARMUP_TRIALS {
+            #[cfg(feature = "metal")]
            run_graph(
                &graph,
                &mut inputs,
@@ -819,27 +790,56 @@ fn cost<'a>(
                &int_buffers,
                &int_buffer_map,
            );
+            #[cfg(feature = "cuda")]
+            run_graph(
+                &mut inputs,
+                &kernels,
+                dyn_vars,
+                &compiled_kernels,
+                &int_buffers,
+                &int_buffer_map,
+            );
        }
        // Test runtime
        let mut micros = vec![];
        let mut outputs = vec![];
-        let mut m;

        for _ in 0..TRIALS {
-            (outputs, m) = run_graph(
-                &graph,
-                &mut inputs,
-                &kernels,
-                dyn_vars,
-                &compiled_kernels,
-                &int_buffers,
-                &int_buffer_map,
-            );
-            micros.push(m);
+            let (o, m_val) = {
+                #[cfg(feature = "metal")]
+                {
+                    run_graph(
+                        &graph,
+                        &mut inputs,
+                        &kernels,
+                        dyn_vars,
+                        &compiled_kernels,
+                        &int_buffers,
+                        &int_buffer_map,
+                    )
+                }
+
+                #[cfg(feature = "cuda")]
+                {
+                    run_graph(
+                        &mut inputs,
+                        &kernels,
+                        dyn_vars,
+                        &compiled_kernels,
+                        &int_buffers,
+                        &int_buffer_map,
+                    )
+                }
+            };
+            outputs = o;
+            micros.push(m_val);
        }
        Some((
            micros.into_iter().sum::<u128>() / TRIALS as u128,
+            #[cfg(feature = "metal")]
            outputs.iter().map(copy_metal_buffer_back).collect_vec(),
+            #[cfg(feature = "cuda")]
+            outputs.iter().map(copy_cuda_buffer_back).collect_vec(),
        ))
    })
 }
--- a/crates/luminal_2/src/run.rs
+++ b/crates/luminal_2/src/run.rs
@@ -464,8 +464,6 @@ pub fn run_graph(
    objc2::rc::autoreleasepool(|_| {
        use objc2_metal::{MTLCommandQueue, MTLCreateSystemDefaultDevice, MTLDevice};

-        // println!("deep down in the mines");
-
        let device = MTLCreateSystemDefaultDevice().unwrap();
        let queue = device.newCommandQueue().expect("No command queue");
        let command_buffer = queue.commandBuffer().unwrap();
--- a/demos/matmul/Cargo.toml
+++ b/demos/matmul/Cargo.toml
@@ -5,7 +5,7 @@ edition = "2021"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 [features]
-default = ["metal"]
+default = ["cuda"]
 cuda = ["dep:cudarc", "dep:luminal_cuda", "luminal_2/cuda"]
 metal = [
    "dep:objc2",
--- a/demos/matmul/kernel_log.txt
+++ b/demos/matmul/kernel_log.txt
@@ -0,0 +1,39 @@
+Compiling kernel:
+extern "C" __global__ void kernel_name(float* a, float* b, float* c) {
+	int loop_e = blockIdx.x;
+	float* f = a + (loop_e*262144);
+	float* g = c + (512*loop_e);
+	int loop_h = blockIdx.y;
+	float* i = f + (loop_h*4096);
+	float* j = g + (loop_h*8);
+	int loop_k = threadIdx.x;
+	float* l = i + (512*loop_k);
+	float* m = j + loop_k;
+	float n[1] = {0.0};
+	for (int load = 0; load < 1; ++load) {
+		n[0] = *(b + 0);
+	}
+	for (int loop_o = 0; loop_o < 512; ++loop_o) {
+		float* p = l + loop_o;
+		float q = *n + *p;
+		*n = q;
+	}
+	*m = *n;
+}
+
+Compiling kernel:
+extern "C" __global__ void kernel_name(float* a, float* b, float* c) {
+	int loop_e = blockIdx.x;
+	float* f = a + (512*loop_e);
+	float* g = c + (loop_e*262144);
+	int loop_h = blockIdx.y;
+	float* i = b + (((loop_h/8)*8)+(loop_h%8));
+	float* j = g + (((loop_h/8)*4096)+((loop_h%8)*512));
+	int loop_k = threadIdx.x;
+	float* l = i + (512*loop_k);
+	float* m = f + loop_k;
+	float* n = j + loop_k;
+	float o = *l * *m;
+	*n = o;
+}
+
--- a/demos/matmul/src/main.rs
+++ b/demos/matmul/src/main.rs
@@ -40,6 +40,12 @@ fn main() {
    with_autoreleasepool(|| {
        #[cfg(feature = "cuda")]
        println!("CUDA MODE ENABLED");
+
+        #[cfg(feature = "metal")]
+        let arch = GPUArch::Metal(HashMap::default());
+        #[cfg(feature = "cuda")]
+        let arch = GPUArch::CUDA;
+
        #[allow(non_snake_case)]
        let (M, K, N) = (512, 512, 512);
        let mut cx = Graph::new();
@@ -52,12 +58,8 @@ fn main() {
            let graph = new_graph.node_weight_mut(graph_node).unwrap();
            // luminal_2::debug::display_graph(&graph);
            let inputs = make_test_inputs(graph, &cx.dyn_map, &accs);
-            #[cfg(feature = "metal")]
-            let arch = GPUArch::Metal(HashMap::default());
-            #[cfg(feature = "cuda")]
-            let arch = GPUArch::CUDA;

-            let searched_graph = search(graph, 3, &inputs, arch, &cx.dyn_map).unwrap();
+            let searched_graph = search(graph, 3, &inputs, arch.clone(), &cx.dyn_map).unwrap();
            // adjust meta-edges
            let old_output = graph.externals(Direction::Outgoing).next().unwrap();
            let new_output = searched_graph
@@ -120,14 +122,8 @@ fn main() {
        for (k, v) in mapping {
            unified_map.insert(k, meta_to_final[&v]);
        }
-        let (kernels, gmem_mapping) = codegen(
-            graph.clone(),
-            outputs,
-            GPUArch::Metal(HashMap::default()),
-            0,
-            &HashMap::default(),
-        )
-        .unwrap();
+        let (kernels, gmem_mapping) =
+            codegen(graph.clone(), outputs, arch, 0, &HashMap::default()).unwrap();

        let compiled = compile_kernels(&kernels);
        let (int_buffers, int_buffer_map) = assign_buffers(&kernels);
@@ -163,15 +159,34 @@ fn main() {
            }
        }

-        let (outputs, _) = run_graph(
-            &graph,
-            &mut inputs,
-            &kernels,
-            &FxHashMap::default(),
-            &compiled,
-            &int_buffers,
-            &int_buffer_map,
-        );
+        println!("DIDYOU REACH HERE?");
+
+        let (outputs, _) = {
+            #[cfg(feature = "metal")]
+            {
+                run_graph(
+                    &graph,
+                    &mut inputs,
+                    &kernels,
+                    &FxHashMap::default(),
+                    &compiled,
+                    &int_buffers,
+                    &int_buffer_map,
+                )
+            }
+
+            #[cfg(feature = "cuda")]
+            {
+                run_graph(
+                    &mut inputs,
+                    &kernels,
+                    &FxHashMap::default(),
+                    &compiled,
+                    &int_buffers,
+                    &int_buffer_map,
+                )
+            }
+        };
        println!("{:?}", &copy_buffer_back(&outputs[0])[..10]);
    });
 }