Files
RustaCUDA/examples/launch.rs
2018-11-27 19:58:42 -06:00

64 lines
2.3 KiB
Rust

#[macro_use]
extern crate rustacuda;
use rustacuda::prelude::*;
use std::error::Error;
use std::ffi::CString;
fn main() -> Result<(), Box<dyn Error>> {
// Set up the context, load the module, and create a stream to run kernels in.
rustacuda::init(CudaFlags::empty())?;
let device = Device::get_device(0)?;
let _ctx = Context::create_and_push(ContextFlags::MAP_HOST | ContextFlags::SCHED_AUTO, device)?;
let ptx = CString::new(include_str!("../resources/add.ptx"))?;
let module = Module::load_from_string(&ptx)?;
let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
// Create buffers for data
let mut in_x = DeviceBuffer::from_slice(&[1.0f32; 10])?;
let mut in_y = DeviceBuffer::from_slice(&[2.0f32; 10])?;
let mut out_1 = DeviceBuffer::from_slice(&[0.0f32; 10])?;
let mut out_2 = DeviceBuffer::from_slice(&[0.0f32; 10])?;
// This kernel adds each element in `in_x` and `in_y` and writes the result into `out`.
unsafe {
// Launch the kernel with one block of one thread, no dynamic shared memory on `stream`.
let result = launch!(module.sum<<<1, 1, 0, stream>>>(
in_x.as_device_ptr(),
in_y.as_device_ptr(),
out_1.as_device_ptr(),
out_1.len()
));
result?;
// Launch the kernel again using the `function` form:
let function_name = CString::new("sum")?;
let sum = module.get_function(&function_name)?;
// Launch with 1x1x1 (1) blocks of 10x1x1 (10) threads, to show that you can use tuples to
// configure grid and block size.
let result = launch!(sum<<<(1, 1, 1), (10, 1, 1), 0, stream>>>(
in_x.as_device_ptr(),
in_y.as_device_ptr(),
out_2.as_device_ptr(),
out_2.len()
));
result?;
}
// Kernel launches are asynchronous, so we wait for the kernels to finish executing.
stream.synchronize()?;
// Copy the results back to host memory
let mut out_host = [0.0f32; 20];
out_1.copy_to(&mut out_host[0..10])?;
out_2.copy_to(&mut out_host[10..20])?;
for x in out_host.iter() {
assert_eq!(3.0 as u32, *x as u32);
}
println!("Launched kernel successfully.");
Ok(())
}