cuda run successful

This commit is contained in:
Matthew Gunton
2025-09-02 22:51:56 +00:00
parent d2339084ca
commit 3c99f23a8b
6 changed files with 148 additions and 91 deletions

View File

@@ -902,6 +902,11 @@ fn make_kernel(
c_inner_stride,
k_outer_loops,
} => {
if cfg!(feature = "cuda") {
// CUDA build: skip / fallback
return None; // or generate a non-TC matmul
}
let mut srcs = kernel_graph
.edges_directed(node, Direction::Incoming)
.sorted_by_key(|e| e.id())

View File

@@ -48,6 +48,24 @@ const INVALID_IR: &[&str] = &[
"TiledMatmulAcc",
];
#[cfg(feature = "metal")]
#[inline]
fn with_autoreleasepool<F, R>(f: F) -> R
where
F: FnOnce() -> R,
{
objc2::rc::autoreleasepool(|_| f())
}
#[cfg(feature = "cuda")]
#[inline]
fn with_autoreleasepool<F, R>(f: F) -> R
where
F: FnOnce() -> R,
{
f()
}
type Cost = u128; // Execution time in microseconds
fn is_expression_enode(enode_label: &str) -> bool {
@@ -729,60 +747,6 @@ pub fn extraction_to_graph(
g
}
#[cfg(feature = "cuda")]
fn cost<'a>(
kernels: &StableGraph<Kernel, (usize, usize), Directed>,
inputs: &[(NodeIndex, InitData)],
gmem_mapping: &HashMap<NodeIndex, usize>,
dyn_vars: &FxHashMap<char, usize>,
) -> Option<(Cost, Vec<Vec<f32>>)> {
let (int_buffers, int_buffer_map) = assign_buffers(&kernels);
let compiled_kernels = compile_kernels(&kernels);
let ctx = CudaContext::new(0).unwrap(); // will need to expand beyond single host
// allocation
let mut inputs = inputs
.into_iter()
.map(|(n, b)| {
(
gmem_mapping[n],
(
copy_cuda_buffer(&b.clone().to_vec(dyn_vars), ctx.clone()),
false,
),
)
})
.collect::<FxHashMap<_, _>>();
for _ in 0..WARMUP_TRIALS {
run_graph(
&mut inputs,
&kernels,
dyn_vars,
&compiled_kernels,
&int_buffers,
&int_buffer_map,
);
}
let mut micros = vec![];
let mut outputs = vec![];
let mut m;
for _ in 0..TRIALS {
(outputs, m) = run_graph(
&mut inputs,
&kernels,
dyn_vars,
&compiled_kernels,
&int_buffers,
&int_buffer_map,
);
micros.push(m);
}
Some((
micros.into_iter().sum::<u128>() / TRIALS as u128,
outputs.iter().map(copy_cuda_buffer_back).collect_vec(),
))
}
#[cfg(feature = "metal")]
fn cost<'a>(
graph: &StableGraph<GraphTerm, ()>,
kernels: &StableGraph<Kernel, (usize, usize), Directed>,
@@ -790,11 +754,14 @@ fn cost<'a>(
gmem_mapping: &HashMap<NodeIndex, usize>,
dyn_vars: &FxHashMap<char, usize>,
) -> Option<(Cost, Vec<Vec<f32>>)> {
autoreleasepool(|_| {
with_autoreleasepool(|| {
// Get buffer info
let (int_buffers, int_buffer_map) = assign_buffers(&kernels);
let compiled_kernels = compile_kernels(&kernels);
#[cfg(feature = "metal")]
let device = MTLCreateSystemDefaultDevice().unwrap();
#[cfg(feature = "cuda")]
let ctx = CudaContext::new(0).unwrap(); // will need to expand beyond single host
// Copy input buffers over
let mut inputs = inputs
.into_iter()
@@ -802,7 +769,10 @@ fn cost<'a>(
(
gmem_mapping[n],
(
#[cfg(feature = "metal")]
copy_metal_buffer(&b.clone().to_vec(dyn_vars), &device),
#[cfg(feature = "cuda")]
copy_cuda_buffer(&b.clone().to_vec(dyn_vars), ctx.clone()),
false,
),
)
@@ -810,6 +780,7 @@ fn cost<'a>(
.collect::<FxHashMap<_, _>>();
// Warm up resources (buffer allocation, kernel compiler, etc.)
for _ in 0..WARMUP_TRIALS {
#[cfg(feature = "metal")]
run_graph(
&graph,
&mut inputs,
@@ -819,27 +790,56 @@ fn cost<'a>(
&int_buffers,
&int_buffer_map,
);
#[cfg(feature = "cuda")]
run_graph(
&mut inputs,
&kernels,
dyn_vars,
&compiled_kernels,
&int_buffers,
&int_buffer_map,
);
}
// Test runtime
let mut micros = vec![];
let mut outputs = vec![];
let mut m;
for _ in 0..TRIALS {
(outputs, m) = run_graph(
&graph,
&mut inputs,
&kernels,
dyn_vars,
&compiled_kernels,
&int_buffers,
&int_buffer_map,
);
micros.push(m);
let (o, m_val) = {
#[cfg(feature = "metal")]
{
run_graph(
&graph,
&mut inputs,
&kernels,
dyn_vars,
&compiled_kernels,
&int_buffers,
&int_buffer_map,
)
}
#[cfg(feature = "cuda")]
{
run_graph(
&mut inputs,
&kernels,
dyn_vars,
&compiled_kernels,
&int_buffers,
&int_buffer_map,
)
}
};
outputs = o;
micros.push(m_val);
}
Some((
micros.into_iter().sum::<u128>() / TRIALS as u128,
#[cfg(feature = "metal")]
outputs.iter().map(copy_metal_buffer_back).collect_vec(),
#[cfg(feature = "cuda")]
outputs.iter().map(copy_cuda_buffer_back).collect_vec(),
))
})
}

View File

@@ -464,8 +464,6 @@ pub fn run_graph(
objc2::rc::autoreleasepool(|_| {
use objc2_metal::{MTLCommandQueue, MTLCreateSystemDefaultDevice, MTLDevice};
// println!("deep down in the mines");
let device = MTLCreateSystemDefaultDevice().unwrap();
let queue = device.newCommandQueue().expect("No command queue");
let command_buffer = queue.commandBuffer().unwrap();

View File

@@ -5,7 +5,7 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[features]
default = ["metal"]
default = ["cuda"]
cuda = ["dep:cudarc", "dep:luminal_cuda", "luminal_2/cuda"]
metal = [
"dep:objc2",

View File

@@ -0,0 +1,39 @@
Compiling kernel:
extern "C" __global__ void kernel_name(float* a, float* b, float* c) {
int loop_e = blockIdx.x;
float* f = a + (loop_e*262144);
float* g = c + (512*loop_e);
int loop_h = blockIdx.y;
float* i = f + (loop_h*4096);
float* j = g + (loop_h*8);
int loop_k = threadIdx.x;
float* l = i + (512*loop_k);
float* m = j + loop_k;
float n[1] = {0.0};
for (int load = 0; load < 1; ++load) {
n[0] = *(b + 0);
}
for (int loop_o = 0; loop_o < 512; ++loop_o) {
float* p = l + loop_o;
float q = *n + *p;
*n = q;
}
*m = *n;
}
Compiling kernel:
extern "C" __global__ void kernel_name(float* a, float* b, float* c) {
int loop_e = blockIdx.x;
float* f = a + (512*loop_e);
float* g = c + (loop_e*262144);
int loop_h = blockIdx.y;
float* i = b + (((loop_h/8)*8)+(loop_h%8));
float* j = g + (((loop_h/8)*4096)+((loop_h%8)*512));
int loop_k = threadIdx.x;
float* l = i + (512*loop_k);
float* m = f + loop_k;
float* n = j + loop_k;
float o = *l * *m;
*n = o;
}

View File

@@ -40,6 +40,12 @@ fn main() {
with_autoreleasepool(|| {
#[cfg(feature = "cuda")]
println!("CUDA MODE ENABLED");
#[cfg(feature = "metal")]
let arch = GPUArch::Metal(HashMap::default());
#[cfg(feature = "cuda")]
let arch = GPUArch::CUDA;
#[allow(non_snake_case)]
let (M, K, N) = (512, 512, 512);
let mut cx = Graph::new();
@@ -52,12 +58,8 @@ fn main() {
let graph = new_graph.node_weight_mut(graph_node).unwrap();
// luminal_2::debug::display_graph(&graph);
let inputs = make_test_inputs(graph, &cx.dyn_map, &accs);
#[cfg(feature = "metal")]
let arch = GPUArch::Metal(HashMap::default());
#[cfg(feature = "cuda")]
let arch = GPUArch::CUDA;
let searched_graph = search(graph, 3, &inputs, arch, &cx.dyn_map).unwrap();
let searched_graph = search(graph, 3, &inputs, arch.clone(), &cx.dyn_map).unwrap();
// adjust meta-edges
let old_output = graph.externals(Direction::Outgoing).next().unwrap();
let new_output = searched_graph
@@ -120,14 +122,8 @@ fn main() {
for (k, v) in mapping {
unified_map.insert(k, meta_to_final[&v]);
}
let (kernels, gmem_mapping) = codegen(
graph.clone(),
outputs,
GPUArch::Metal(HashMap::default()),
0,
&HashMap::default(),
)
.unwrap();
let (kernels, gmem_mapping) =
codegen(graph.clone(), outputs, arch, 0, &HashMap::default()).unwrap();
let compiled = compile_kernels(&kernels);
let (int_buffers, int_buffer_map) = assign_buffers(&kernels);
@@ -163,15 +159,34 @@ fn main() {
}
}
let (outputs, _) = run_graph(
&graph,
&mut inputs,
&kernels,
&FxHashMap::default(),
&compiled,
&int_buffers,
&int_buffer_map,
);
println!("DIDYOU REACH HERE?");
let (outputs, _) = {
#[cfg(feature = "metal")]
{
run_graph(
&graph,
&mut inputs,
&kernels,
&FxHashMap::default(),
&compiled,
&int_buffers,
&int_buffer_map,
)
}
#[cfg(feature = "cuda")]
{
run_graph(
&mut inputs,
&kernels,
&FxHashMap::default(),
&compiled,
&int_buffers,
&int_buffer_map,
)
}
};
println!("{:?}", &copy_buffer_back(&outputs[0])[..10]);
});
}