mirror of
https://github.com/jafioti/luminal.git
synced 2026-06-01 21:49:47 +09:00
cuda run successful
This commit is contained in:
@@ -902,6 +902,11 @@ fn make_kernel(
|
||||
c_inner_stride,
|
||||
k_outer_loops,
|
||||
} => {
|
||||
if cfg!(feature = "cuda") {
|
||||
// CUDA build: skip / fallback
|
||||
return None; // or generate a non-TC matmul
|
||||
}
|
||||
|
||||
let mut srcs = kernel_graph
|
||||
.edges_directed(node, Direction::Incoming)
|
||||
.sorted_by_key(|e| e.id())
|
||||
|
||||
@@ -48,6 +48,24 @@ const INVALID_IR: &[&str] = &[
|
||||
"TiledMatmulAcc",
|
||||
];
|
||||
|
||||
#[cfg(feature = "metal")]
|
||||
#[inline]
|
||||
fn with_autoreleasepool<F, R>(f: F) -> R
|
||||
where
|
||||
F: FnOnce() -> R,
|
||||
{
|
||||
objc2::rc::autoreleasepool(|_| f())
|
||||
}
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
#[inline]
|
||||
fn with_autoreleasepool<F, R>(f: F) -> R
|
||||
where
|
||||
F: FnOnce() -> R,
|
||||
{
|
||||
f()
|
||||
}
|
||||
|
||||
type Cost = u128; // Execution time in microseconds
|
||||
|
||||
fn is_expression_enode(enode_label: &str) -> bool {
|
||||
@@ -729,60 +747,6 @@ pub fn extraction_to_graph(
|
||||
g
|
||||
}
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
fn cost<'a>(
|
||||
kernels: &StableGraph<Kernel, (usize, usize), Directed>,
|
||||
inputs: &[(NodeIndex, InitData)],
|
||||
gmem_mapping: &HashMap<NodeIndex, usize>,
|
||||
dyn_vars: &FxHashMap<char, usize>,
|
||||
) -> Option<(Cost, Vec<Vec<f32>>)> {
|
||||
let (int_buffers, int_buffer_map) = assign_buffers(&kernels);
|
||||
let compiled_kernels = compile_kernels(&kernels);
|
||||
let ctx = CudaContext::new(0).unwrap(); // will need to expand beyond single host
|
||||
// allocation
|
||||
let mut inputs = inputs
|
||||
.into_iter()
|
||||
.map(|(n, b)| {
|
||||
(
|
||||
gmem_mapping[n],
|
||||
(
|
||||
copy_cuda_buffer(&b.clone().to_vec(dyn_vars), ctx.clone()),
|
||||
false,
|
||||
),
|
||||
)
|
||||
})
|
||||
.collect::<FxHashMap<_, _>>();
|
||||
for _ in 0..WARMUP_TRIALS {
|
||||
run_graph(
|
||||
&mut inputs,
|
||||
&kernels,
|
||||
dyn_vars,
|
||||
&compiled_kernels,
|
||||
&int_buffers,
|
||||
&int_buffer_map,
|
||||
);
|
||||
}
|
||||
let mut micros = vec![];
|
||||
let mut outputs = vec![];
|
||||
let mut m;
|
||||
for _ in 0..TRIALS {
|
||||
(outputs, m) = run_graph(
|
||||
&mut inputs,
|
||||
&kernels,
|
||||
dyn_vars,
|
||||
&compiled_kernels,
|
||||
&int_buffers,
|
||||
&int_buffer_map,
|
||||
);
|
||||
micros.push(m);
|
||||
}
|
||||
Some((
|
||||
micros.into_iter().sum::<u128>() / TRIALS as u128,
|
||||
outputs.iter().map(copy_cuda_buffer_back).collect_vec(),
|
||||
))
|
||||
}
|
||||
|
||||
#[cfg(feature = "metal")]
|
||||
fn cost<'a>(
|
||||
graph: &StableGraph<GraphTerm, ()>,
|
||||
kernels: &StableGraph<Kernel, (usize, usize), Directed>,
|
||||
@@ -790,11 +754,14 @@ fn cost<'a>(
|
||||
gmem_mapping: &HashMap<NodeIndex, usize>,
|
||||
dyn_vars: &FxHashMap<char, usize>,
|
||||
) -> Option<(Cost, Vec<Vec<f32>>)> {
|
||||
autoreleasepool(|_| {
|
||||
with_autoreleasepool(|| {
|
||||
// Get buffer info
|
||||
let (int_buffers, int_buffer_map) = assign_buffers(&kernels);
|
||||
let compiled_kernels = compile_kernels(&kernels);
|
||||
#[cfg(feature = "metal")]
|
||||
let device = MTLCreateSystemDefaultDevice().unwrap();
|
||||
#[cfg(feature = "cuda")]
|
||||
let ctx = CudaContext::new(0).unwrap(); // will need to expand beyond single host
|
||||
// Copy input buffers over
|
||||
let mut inputs = inputs
|
||||
.into_iter()
|
||||
@@ -802,7 +769,10 @@ fn cost<'a>(
|
||||
(
|
||||
gmem_mapping[n],
|
||||
(
|
||||
#[cfg(feature = "metal")]
|
||||
copy_metal_buffer(&b.clone().to_vec(dyn_vars), &device),
|
||||
#[cfg(feature = "cuda")]
|
||||
copy_cuda_buffer(&b.clone().to_vec(dyn_vars), ctx.clone()),
|
||||
false,
|
||||
),
|
||||
)
|
||||
@@ -810,6 +780,7 @@ fn cost<'a>(
|
||||
.collect::<FxHashMap<_, _>>();
|
||||
// Warm up resources (buffer allocation, kernel compiler, etc.)
|
||||
for _ in 0..WARMUP_TRIALS {
|
||||
#[cfg(feature = "metal")]
|
||||
run_graph(
|
||||
&graph,
|
||||
&mut inputs,
|
||||
@@ -819,27 +790,56 @@ fn cost<'a>(
|
||||
&int_buffers,
|
||||
&int_buffer_map,
|
||||
);
|
||||
#[cfg(feature = "cuda")]
|
||||
run_graph(
|
||||
&mut inputs,
|
||||
&kernels,
|
||||
dyn_vars,
|
||||
&compiled_kernels,
|
||||
&int_buffers,
|
||||
&int_buffer_map,
|
||||
);
|
||||
}
|
||||
// Test runtime
|
||||
let mut micros = vec![];
|
||||
let mut outputs = vec![];
|
||||
let mut m;
|
||||
|
||||
for _ in 0..TRIALS {
|
||||
(outputs, m) = run_graph(
|
||||
&graph,
|
||||
&mut inputs,
|
||||
&kernels,
|
||||
dyn_vars,
|
||||
&compiled_kernels,
|
||||
&int_buffers,
|
||||
&int_buffer_map,
|
||||
);
|
||||
micros.push(m);
|
||||
let (o, m_val) = {
|
||||
#[cfg(feature = "metal")]
|
||||
{
|
||||
run_graph(
|
||||
&graph,
|
||||
&mut inputs,
|
||||
&kernels,
|
||||
dyn_vars,
|
||||
&compiled_kernels,
|
||||
&int_buffers,
|
||||
&int_buffer_map,
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
{
|
||||
run_graph(
|
||||
&mut inputs,
|
||||
&kernels,
|
||||
dyn_vars,
|
||||
&compiled_kernels,
|
||||
&int_buffers,
|
||||
&int_buffer_map,
|
||||
)
|
||||
}
|
||||
};
|
||||
outputs = o;
|
||||
micros.push(m_val);
|
||||
}
|
||||
Some((
|
||||
micros.into_iter().sum::<u128>() / TRIALS as u128,
|
||||
#[cfg(feature = "metal")]
|
||||
outputs.iter().map(copy_metal_buffer_back).collect_vec(),
|
||||
#[cfg(feature = "cuda")]
|
||||
outputs.iter().map(copy_cuda_buffer_back).collect_vec(),
|
||||
))
|
||||
})
|
||||
}
|
||||
|
||||
@@ -464,8 +464,6 @@ pub fn run_graph(
|
||||
objc2::rc::autoreleasepool(|_| {
|
||||
use objc2_metal::{MTLCommandQueue, MTLCreateSystemDefaultDevice, MTLDevice};
|
||||
|
||||
// println!("deep down in the mines");
|
||||
|
||||
let device = MTLCreateSystemDefaultDevice().unwrap();
|
||||
let queue = device.newCommandQueue().expect("No command queue");
|
||||
let command_buffer = queue.commandBuffer().unwrap();
|
||||
|
||||
@@ -5,7 +5,7 @@ edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
[features]
|
||||
default = ["metal"]
|
||||
default = ["cuda"]
|
||||
cuda = ["dep:cudarc", "dep:luminal_cuda", "luminal_2/cuda"]
|
||||
metal = [
|
||||
"dep:objc2",
|
||||
|
||||
39
demos/matmul/kernel_log.txt
Normal file
39
demos/matmul/kernel_log.txt
Normal file
@@ -0,0 +1,39 @@
|
||||
Compiling kernel:
|
||||
extern "C" __global__ void kernel_name(float* a, float* b, float* c) {
|
||||
int loop_e = blockIdx.x;
|
||||
float* f = a + (loop_e*262144);
|
||||
float* g = c + (512*loop_e);
|
||||
int loop_h = blockIdx.y;
|
||||
float* i = f + (loop_h*4096);
|
||||
float* j = g + (loop_h*8);
|
||||
int loop_k = threadIdx.x;
|
||||
float* l = i + (512*loop_k);
|
||||
float* m = j + loop_k;
|
||||
float n[1] = {0.0};
|
||||
for (int load = 0; load < 1; ++load) {
|
||||
n[0] = *(b + 0);
|
||||
}
|
||||
for (int loop_o = 0; loop_o < 512; ++loop_o) {
|
||||
float* p = l + loop_o;
|
||||
float q = *n + *p;
|
||||
*n = q;
|
||||
}
|
||||
*m = *n;
|
||||
}
|
||||
|
||||
Compiling kernel:
|
||||
extern "C" __global__ void kernel_name(float* a, float* b, float* c) {
|
||||
int loop_e = blockIdx.x;
|
||||
float* f = a + (512*loop_e);
|
||||
float* g = c + (loop_e*262144);
|
||||
int loop_h = blockIdx.y;
|
||||
float* i = b + (((loop_h/8)*8)+(loop_h%8));
|
||||
float* j = g + (((loop_h/8)*4096)+((loop_h%8)*512));
|
||||
int loop_k = threadIdx.x;
|
||||
float* l = i + (512*loop_k);
|
||||
float* m = f + loop_k;
|
||||
float* n = j + loop_k;
|
||||
float o = *l * *m;
|
||||
*n = o;
|
||||
}
|
||||
|
||||
@@ -40,6 +40,12 @@ fn main() {
|
||||
with_autoreleasepool(|| {
|
||||
#[cfg(feature = "cuda")]
|
||||
println!("CUDA MODE ENABLED");
|
||||
|
||||
#[cfg(feature = "metal")]
|
||||
let arch = GPUArch::Metal(HashMap::default());
|
||||
#[cfg(feature = "cuda")]
|
||||
let arch = GPUArch::CUDA;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
let (M, K, N) = (512, 512, 512);
|
||||
let mut cx = Graph::new();
|
||||
@@ -52,12 +58,8 @@ fn main() {
|
||||
let graph = new_graph.node_weight_mut(graph_node).unwrap();
|
||||
// luminal_2::debug::display_graph(&graph);
|
||||
let inputs = make_test_inputs(graph, &cx.dyn_map, &accs);
|
||||
#[cfg(feature = "metal")]
|
||||
let arch = GPUArch::Metal(HashMap::default());
|
||||
#[cfg(feature = "cuda")]
|
||||
let arch = GPUArch::CUDA;
|
||||
|
||||
let searched_graph = search(graph, 3, &inputs, arch, &cx.dyn_map).unwrap();
|
||||
let searched_graph = search(graph, 3, &inputs, arch.clone(), &cx.dyn_map).unwrap();
|
||||
// adjust meta-edges
|
||||
let old_output = graph.externals(Direction::Outgoing).next().unwrap();
|
||||
let new_output = searched_graph
|
||||
@@ -120,14 +122,8 @@ fn main() {
|
||||
for (k, v) in mapping {
|
||||
unified_map.insert(k, meta_to_final[&v]);
|
||||
}
|
||||
let (kernels, gmem_mapping) = codegen(
|
||||
graph.clone(),
|
||||
outputs,
|
||||
GPUArch::Metal(HashMap::default()),
|
||||
0,
|
||||
&HashMap::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let (kernels, gmem_mapping) =
|
||||
codegen(graph.clone(), outputs, arch, 0, &HashMap::default()).unwrap();
|
||||
|
||||
let compiled = compile_kernels(&kernels);
|
||||
let (int_buffers, int_buffer_map) = assign_buffers(&kernels);
|
||||
@@ -163,15 +159,34 @@ fn main() {
|
||||
}
|
||||
}
|
||||
|
||||
let (outputs, _) = run_graph(
|
||||
&graph,
|
||||
&mut inputs,
|
||||
&kernels,
|
||||
&FxHashMap::default(),
|
||||
&compiled,
|
||||
&int_buffers,
|
||||
&int_buffer_map,
|
||||
);
|
||||
println!("DIDYOU REACH HERE?");
|
||||
|
||||
let (outputs, _) = {
|
||||
#[cfg(feature = "metal")]
|
||||
{
|
||||
run_graph(
|
||||
&graph,
|
||||
&mut inputs,
|
||||
&kernels,
|
||||
&FxHashMap::default(),
|
||||
&compiled,
|
||||
&int_buffers,
|
||||
&int_buffer_map,
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
{
|
||||
run_graph(
|
||||
&mut inputs,
|
||||
&kernels,
|
||||
&FxHashMap::default(),
|
||||
&compiled,
|
||||
&int_buffers,
|
||||
&int_buffer_map,
|
||||
)
|
||||
}
|
||||
};
|
||||
println!("{:?}", ©_buffer_back(&outputs[0])[..10]);
|
||||
});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user