From c637a639532965d149c4121175f00218886ea088 Mon Sep 17 00:00:00 2001
From: demian3b <ms.choi@3billion.io>
Date: Wed, 15 Apr 2026 23:48:51 +0900
Subject: [PATCH] Update project configuration and documentation for GPU
 support; add GPU environment check script and CUDA runtime validation

---
 .gitignore                |  2 ++
 Cargo.toml                |  8 ++------
 README.md                 | 16 +++++++++++++++-
 examples/toy_pipeline.rs  | 15 +--------------
 scripts/gpu_env_check.sh  | 20 ++++++++++++++++++++
 scripts/gpu_smoke_test.py | 23 +++++++++++++++++++++++
 src/api.rs                |  1 -
 src/burn_adapter.rs       |  7 -------
 8 files changed, 63 insertions(+), 29 deletions(-)
 create mode 100755 scripts/gpu_env_check.sh
 create mode 100644 scripts/gpu_smoke_test.py

diff --git a/.gitignore b/.gitignore
index 5730393..18b22ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+outputs
+
 # ---> Rust
 # Generated by Cargo
 # will have compiled files and executables
diff --git a/Cargo.toml b/Cargo.toml
index d861ee7..d3672cb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,17 +3,13 @@ name = "riemann-flow-gnn"
 version = "0.1.0"
 edition = "2021"
 
-[features]
-default = []
-burn = ["dep:burn", "dep:safetensors"]
-
 [dependencies]
 plotters = { version = "0.3", default-features = false, features = ["bitmap_backend", "bitmap_encoder"] }
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 thiserror = "1"
-burn = { version = "0.14", optional = true, features = ["autodiff", "ndarray"] }
-safetensors = { version = "0.4", optional = true }
+burn = { version = "0.14", features = ["autodiff", "cuda-jit"] }
+safetensors = { version = "0.4" }
 
 [dev-dependencies]
 approx = "0.5"
diff --git a/README.md b/README.md
index 58b432d..b6c77eb 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,20 @@ Riemannian Flow matching with GNN implementation in rust/burn
 cargo run --example toy_pipeline
 ```
 
+### NVIDIA GPU 환경 점검
+
+```bash
+bash scripts/gpu_env_check.sh
+```
+
+점검 항목:
+- `nvidia-smi` 드라이버/디바이스 인식
+- `nvcc --version` CUDA toolkit 인식
+- `scripts/gpu_smoke_test.py` PyTorch CUDA matmul 런타임 실행 (CUDA 지원 PyTorch 설치 필요)
+- `cargo check --example toy_pipeline` Burn 빌드 스모크 테스트
+
+예제 학습 스켈레톤은 **기본으로 Burn CUDA JIT 백엔드(GPU)** 를 사용합니다. NVIDIA 드라이버와 CUDA 개발 환경(`nvcc` 등)이 필요합니다.
+
 예제는 파이프라인 계약 확인용으로 동작합니다.
 
 ### 현재 포함 범위
@@ -20,7 +34,7 @@ cargo run --example toy_pipeline
 - graph/domain struct (`Atom`, `Bond`, `TorsionEdge`, `LigandGraph`)
 - one-sample entry API (`api::OneSampleToyPipeline`)
 - graph/pose/simulator 최소 계약
-- Burn tensor shape 중심 adapter skeleton (`--features burn`)
+- Burn tensor shape 중심 adapter skeleton (기본 활성)
 - TODO 기반 구현 포인트 주석
 
 ### 이후 단계 권장
diff --git a/examples/toy_pipeline.rs b/examples/toy_pipeline.rs
index f1e65e2..16a8c31 100644
--- a/examples/toy_pipeline.rs
+++ b/examples/toy_pipeline.rs
@@ -1,8 +1,6 @@
-#[cfg(feature = "burn")]
 use std::{collections::BTreeMap, marker::PhantomData, path::Path};
 use std::fs;
 
-#[cfg(feature = "burn")]
 use burn::{backend::Autodiff, tensor::backend::AutodiffBackend};
 use riemann_flow_gnn::{
     api::{OneSampleToyBatch, OneSampleToyPipeline},
@@ -12,18 +10,15 @@ use riemann_flow_gnn::{
     viz::{export_simulation_video, plot_static_png},
 };
 use serde::Deserialize;
-#[cfg(feature = "burn")]
 use riemann_flow_gnn::api::burn_training::{
     run_burn_training, BurnToyModel, BurnTrainConfig,
 };
-#[cfg(feature = "burn")]
 use safetensors::{
     serialize_to_file,
     tensor::{Dtype, TensorView},
 };
 
-#[cfg(feature = "burn")]
-type ExampleBackend = Autodiff<burn::backend::NdArray>;
+type ExampleBackend = Autodiff<burn::backend::CudaJit>;
 
 #[derive(Debug, Deserialize)]
 struct ToyAtom {
@@ -100,12 +95,10 @@ fn load_toy_reference(path: &str) -> Result<(LigandGraph, LigandConformer), Box<
     Ok((graph, reference))
 }
 
-#[cfg(feature = "burn")]
 struct ExampleBurnModel<B: AutodiffBackend> {
     _backend: PhantomData<B>,
 }
 
-#[cfg(feature = "burn")]
 impl<B: AutodiffBackend> ExampleBurnModel<B> {
     fn forward_template(&self, _graph: &riemann_flow_gnn::graph::GraphFeatures) -> Result<(), String> {
         // TODO(user): Burn tensor 입력(GraphTensors, PoseTensors) 기반 forward 구현
@@ -123,7 +116,6 @@ impl<B: AutodiffBackend> ExampleBurnModel<B> {
     }
 }
 
-#[cfg(feature = "burn")]
 impl<B: AutodiffBackend> BurnToyModel<B> for ExampleBurnModel<B> {
     fn name(&self) -> &'static str {
         "burn-toy-model"
@@ -196,7 +188,6 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     });
     let graph_features = pipeline.graph_contract();
 
-    #[cfg(feature = "burn")]
     let history = {
         let device = Default::default();
         let mut model = ExampleBurnModel::<ExampleBackend>::new(&device);
@@ -233,16 +224,12 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         graph_features.spec.node_feat_dim,
         graph_features.spec.edge_feat_dim
     );
-    #[cfg(feature = "burn")]
     println!(
         "burn training finished: epochs={}, last_loss={:.6}",
         history.len(),
         history.last().map(|m| m.loss).unwrap_or(0.0)
     );
-    #[cfg(feature = "burn")]
     println!("saved checkpoints: outputs/checkpoints/*.safetensors");
-    #[cfg(not(feature = "burn"))]
-    println!("burn feature is off: training skeleton skipped");
     println!("saved video: outputs/simulation.mp4");
     println!("target reference: data/toy/aspirin_reference.json");
     Ok(())
diff --git a/scripts/gpu_env_check.sh b/scripts/gpu_env_check.sh
new file mode 100755
index 0000000..b6bdab4
--- /dev/null
+++ b/scripts/gpu_env_check.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# GPU environment check: driver, CUDA toolkit, PyTorch CUDA runtime, Burn build.
+# `set -e` aborts on the first failing step, so the final message means all passed.
+set -euo pipefail
+
+# Steps 3 and 4 depend on the working directory; move to the repo root so any cwd works.
+CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.."
+
+echo "[1/4] NVIDIA driver check"
+nvidia-smi
+
+printf '\n%s\n' "[2/4] CUDA compiler check"
+nvcc --version
+
+printf '\n%s\n' "[3/4] PyTorch CUDA runtime check"
+python "scripts/gpu_smoke_test.py"
+
+printf '\n%s\n' "[4/4] Burn build check"
+cargo check --example toy_pipeline
+printf '\n%s\n' "GPU environment check finished successfully."
diff --git a/scripts/gpu_smoke_test.py b/scripts/gpu_smoke_test.py
new file mode 100644
index 0000000..396fe74
--- /dev/null
+++ b/scripts/gpu_smoke_test.py
@@ -0,0 +1,23 @@
+import torch
+
+
+def main() -> None:
+    """Print torch/CUDA info and run one CUDA matmul; raise SystemExit if CUDA is unavailable."""
+    print(f"torch={torch.__version__}")
+    print(f"cuda_available={torch.cuda.is_available()}")
+    if not torch.cuda.is_available():
+        raise SystemExit("CUDA is not available")
+    gpu_total = torch.cuda.device_count()
+    print(f"cuda_device_count={gpu_total}")
+    for idx in range(gpu_total):
+        print(f"gpu[{idx}]={torch.cuda.get_device_name(idx)}")
+
+    lhs = torch.randn((2048, 2048), device="cuda")
+    rhs = torch.randn((2048, 2048), device="cuda")
+    product = torch.matmul(lhs, rhs)
+    torch.cuda.synchronize()
+    print(f"matmul_ok shape={tuple(product.shape)} dtype={product.dtype} device={product.device}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/api.rs b/src/api.rs
index 2cd8bb0..97ce86b 100644
--- a/src/api.rs
+++ b/src/api.rs
@@ -46,7 +46,6 @@ impl OneSampleToyPipeline {
     }
 }
 
-#[cfg(feature = "burn")]
 pub mod burn_training {
     use std::{
         collections::BTreeMap,
diff --git a/src/burn_adapter.rs b/src/burn_adapter.rs
index 7f7e02d..938939c 100644
--- a/src/burn_adapter.rs
+++ b/src/burn_adapter.rs
@@ -1,12 +1,8 @@
-#[cfg(feature = "burn")]
 use burn::tensor::{backend::Backend, Int, Tensor};
 
-#[cfg(feature = "burn")]
 use crate::geometry::PoseState;
-#[cfg(feature = "burn")]
 use crate::graph::GraphFeatures;
 
-#[cfg(feature = "burn")]
 pub struct GraphTensors<B: Backend> {
     pub node_features: Tensor<B, 2>,
     pub edge_index: Tensor<B, 2, Int>,
@@ -14,7 +10,6 @@ pub struct GraphTensors<B: Backend> {
     pub torsion_edge_index: Tensor<B, 2, Int>,
 }
 
-#[cfg(feature = "burn")]
 impl<B: Backend> GraphTensors<B> {
     pub fn from_features(features: &GraphFeatures, device: &B::Device) -> Self {
         Self {
@@ -35,14 +30,12 @@ impl<B: Backend> GraphTensors<B> {
     }
 }
 
-#[cfg(feature = "burn")]
 pub struct PoseTensors<B: Backend> {
     pub translation: Tensor<B, 2>,
     pub rotation: Tensor<B, 2>,
     pub torsions: Tensor<B, 2>,
 }
 
-#[cfg(feature = "burn")]
 impl<B: Backend> PoseTensors<B> {
     pub fn from_pose_batch(poses: &[PoseState], device: &B::Device) -> Self {
         let batch = poses.len();