From ac0fd8ff41e9f050507f0d2412d477efd2ec1f60 Mon Sep 17 00:00:00 2001 From: demian3b Date: Thu, 16 Apr 2026 23:28:47 +0900 Subject: [PATCH] Record per-run logging policy and sync non-train artifacts. Require immediate README logging plus per-attempt commits, and publish latest diagnostics/reports and trajectory-analysis utility without carrying train.py changes. Made-with: Cursor --- GUIDELINES.md | 1 + README.md | 1 + reports/latest_eval.json | 10 +-- reports/trajectory_diagnostics.json | 65 ++++++++++++++++ scripts/analyze_trajectory_quality.py | 106 ++++++++++++++++++++++++++ 5 files changed, 178 insertions(+), 5 deletions(-) create mode 100644 reports/trajectory_diagnostics.json create mode 100644 scripts/analyze_trajectory_quality.py diff --git a/GUIDELINES.md b/GUIDELINES.md index 9402b33..67080bb 100644 --- a/GUIDELINES.md +++ b/GUIDELINES.md @@ -13,6 +13,7 @@ Make overfitting robust and measurable, targeting `mean_rmsd_100 <= 1.0`. 5. When a branch is ready to land: **merge (or cherry-pick) into `main`**. The performance gate and `BEST_PRACTICE.json` / best-artifact refresh run only on **`main`** when `train.py` is part of the commit. 6. **`README.md` attempt log must also live on `main`**: if you only merged code later or abandoned a `train.py` merge, still **bring new `## Attempt Log` lines onto `main`** soon after (docs-only commit is fine—stage **only** `README.md` so the mean-RMSD gate does not run). Cherry-pick the README hunk from the branch or copy the lines; do not leave the canonical log only on a feature branch. 7. **Mandatory best-update integration**: if any feature-branch attempt records a strictly better `mean_rmsd_100` than the current `main` anchor, treat it as merge-ready work. Merge/cherry-pick it into `main` promptly (do not keep a known best only on a feature branch), then continue new experiments from a fresh branch off updated `main`. +8. 
**Per-attempt logging+commit is mandatory**: every experiment run must immediately (a) append its result to `README.md` and then (b) create a branch commit for that attempt before starting the next run. Do not batch multiple uncommitted runs. ## Training budget and stopping diff --git a/README.md b/README.md index 20ae8c9..b6f2373 100644 --- a/README.md +++ b/README.md @@ -92,3 +92,4 @@ This repository is intentionally pinned to CUDA 12.6 PyTorch wheels and matching - 2026-04-16: Follow-up parallel sweep (GPU0/1/2) with direct best-axis reruns produced `2.430481`, `2.412036`, and `2.720380`; observed heavy seed sensitivity and intermittent fallback-to-1000 behavior on unstable seeds. - 2026-04-16: Continued parallel sweep with rotation curriculum variants (`start=0.85/0.95` and lower-lr schedule) produced `2.450391`, `2.457748`, and `2.426384`; no improvement over branch best `2.388103`. - 2026-04-16: Deep schedule parallel sweep (`epochs=320~380`, `start=1.0` with warmup variants, multi-seed) produced `2.464117`, `2.410706`, and `2.419527`; still below branch best and showed late-epoch fallback instability in 일부 runs. +- 2026-04-16: Post-reset attempt on `attempt/s3-tail-risk-next` (trajectory-tail-risk focus) using residual-geodesic with clipped omega and scheduled rotation weight (`lr=6.8e-4`, `grad_clip=0.7`, `start=1.0`, warmup `120`) reached `mean_rmsd_100=2.464730`; no improvement. 
diff --git a/reports/latest_eval.json b/reports/latest_eval.json index 4df76ed..214c668 100644 --- a/reports/latest_eval.json +++ b/reports/latest_eval.json @@ -1,12 +1,12 @@ { - "mean_rmsd_100": 2.4887725603580475, + "mean_rmsd_100": 2.464729991555214, "num_runs": 100, - "timestamp_utc": "2026-04-16T13:34:16.784868+00:00", - "command": "train.py --sdf /data/demian_dev/toy/sample.sdf --epochs 280 --batch-size 24 --model-type gcn --hidden 512 --gcn-layers 8 --eval-runs 100 --loss-domain displacement --weight-center 0.8 --weight-omega 2.0 --weight-torsion 3.0 --grad-clip 0.8 --rotation-loss geodesic --gcn-residual --lr 7e-4 --seed 1", + "timestamp_utc": "2026-04-16T14:27:02.733292+00:00", + "command": "train.py --sdf /data/demian_dev/toy/sample.sdf --epochs 320 --batch-size 24 --model-type gcn --hidden 512 --gcn-layers 8 --eval-runs 100 --loss-domain displacement --weight-center 0.8 --weight-omega 2.0 --weight-torsion 3.0 --grad-clip 0.7 --rotation-loss geodesic --gcn-residual --lr 6.8e-4 --omega-max-norm 5.0 --rotation-weight-start 1.0 --rotation-weight-warmup-epochs 120 --seed 1", "notes": "Final test metric from 100 random-initialized rollouts to time=1.", - "best_train_mse": 3.5625081062316895, + "best_train_mse": 3.5417799949645996, "model_source": "best_train_checkpoint", "checkpoint_path": "/data/demian_dev/toy/ai_rfm/artifacts/latest_eval_best_model.pt", "stop_reason": "max_epochs", - "stop_epoch": 279 + "stop_epoch": 319 } \ No newline at end of file diff --git a/reports/trajectory_diagnostics.json b/reports/trajectory_diagnostics.json new file mode 100644 index 0000000..f2d5c17 --- /dev/null +++ b/reports/trajectory_diagnostics.json @@ -0,0 +1,65 @@ +{ + "summary": { + "num_files": 6, + "mean_of_mean_pair_delta_final": 0.09374758909572645, + "mean_of_max_pair_delta_final": 0.5447731928553062, + "max_of_max_pair_delta_peak": 1.2531537187469057, + "worst_file_by_peak_delta": "trajectory_02.sdf" + }, + "files": [ + { + "file": "trajectory_00.sdf", + 
"num_frames": 100, + "mean_atom_disp_final": 5.811090559694248, + "mean_pair_delta_final": 0.12735449847546665, + "max_pair_delta_final": 0.6744208078944323, + "mean_pair_delta_peak": 0.12735449847546665, + "max_pair_delta_peak": 0.6744208078944323 + }, + { + "file": "trajectory_01.sdf", + "num_frames": 100, + "mean_atom_disp_final": 3.6108782907891968, + "mean_pair_delta_final": 0.03265044784489361, + "max_pair_delta_final": 0.1503832748237155, + "mean_pair_delta_peak": 0.03265044784489361, + "max_pair_delta_peak": 0.18550951823859751 + }, + { + "file": "trajectory_02.sdf", + "num_frames": 100, + "mean_atom_disp_final": 3.2025008560534856, + "mean_pair_delta_final": 0.21092013840082602, + "max_pair_delta_final": 1.2531537187469057, + "mean_pair_delta_peak": 0.21092013840082602, + "max_pair_delta_peak": 1.2531537187469057 + }, + { + "file": "trajectory_03.sdf", + "num_frames": 100, + "mean_atom_disp_final": 3.9383143201666133, + "mean_pair_delta_final": 0.06427207313228954, + "max_pair_delta_final": 0.3494554453859795, + "mean_pair_delta_peak": 0.06427207313228954, + "max_pair_delta_peak": 0.3494554453859795 + }, + { + "file": "trajectory_04.sdf", + "num_frames": 100, + "mean_atom_disp_final": 2.571475460177569, + "mean_pair_delta_final": 0.03944537811959266, + "max_pair_delta_final": 0.2589912236344194, + "mean_pair_delta_peak": 0.03944537811959266, + "max_pair_delta_peak": 0.2589912236344194 + }, + { + "file": "trajectory_05.sdf", + "num_frames": 100, + "mean_atom_disp_final": 4.85538013425731, + "mean_pair_delta_final": 0.0878429986012902, + "max_pair_delta_final": 0.582234686646385, + "mean_pair_delta_peak": 0.0878429986012902, + "max_pair_delta_peak": 0.5830873173627418 + } + ] +} diff --git a/scripts/analyze_trajectory_quality.py b/scripts/analyze_trajectory_quality.py new file mode 100644 index 0000000..e447d85 --- /dev/null +++ b/scripts/analyze_trajectory_quality.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import 
import argparse
import json
from pathlib import Path
from typing import TYPE_CHECKING, Optional, Sequence

import numpy as np

if TYPE_CHECKING:
    # Only needed for annotations here; the runtime import happens inside
    # analyze_file so the NumPy-only helpers stay usable without RDKit.
    from rdkit import Chem


def coords_from_mol(mol: "Chem.Mol") -> np.ndarray:
    """Return the atom coordinates of ``mol``'s conformer as a float64 array.

    Args:
        mol: Molecule object exposing ``GetConformer()``.

    Returns:
        A freshly allocated array with one ``(x, y, z)`` row per atom.
    """
    positions = mol.GetConformer().GetPositions()
    return np.array(positions, dtype=np.float64).reshape(-1, 3)


def pairwise_dist(coords: np.ndarray) -> np.ndarray:
    """Return the square matrix of Euclidean distances between coordinate rows.

    Args:
        coords: 2-D array-like with one point per row.

    Returns:
        A symmetric ``(n, n)`` array whose ``[i, j]`` entry is the distance
        between row ``i`` and row ``j``.
    """
    pts = np.asarray(coords, dtype=np.float64)
    diff = pts[:, None, :] - pts[None, :, :]
    return np.sqrt(np.sum(diff * diff, axis=-1))


def analyze_frames(name: str, frames: Sequence[np.ndarray]) -> dict:
    """Compute drift metrics for a trajectory given as coordinate frames.

    Every frame is compared against the first frame: per-atom displacement,
    and the absolute change of each atom-pair distance.

    Args:
        name: Label stored under the ``"file"`` key of the result.
        frames: Coordinate arrays, one per frame, each with one row per atom.

    Returns:
        A metrics row, or ``{"file": name, "error": <code>}`` when the input
        cannot be analyzed. Error codes are ``"not_enough_frames"`` (fewer
        than two frames), ``"inconsistent_atom_count"`` (frames differ in
        shape) and ``"not_enough_atoms"`` (fewer than two atoms, so there is
        no atom pair to measure).
    """
    if len(frames) < 2:
        return {"file": name, "error": "not_enough_frames"}

    arrays = [np.asarray(frame, dtype=np.float64) for frame in frames]
    reference = arrays[0]
    # Report malformed trajectories as error rows instead of letting a
    # broadcasting or empty-reduction ValueError abort the whole batch.
    if reference.ndim != 2 or any(a.shape != reference.shape for a in arrays):
        return {"file": name, "error": "inconsistent_atom_count"}
    if reference.shape[0] < 2:
        return {"file": name, "error": "not_enough_atoms"}

    # Strict upper triangle: each unordered atom pair exactly once.
    upper = np.triu_indices(reference.shape[0], 1)
    ref_pairs = pairwise_dist(reference)[upper]

    mean_atom_disp = []
    mean_pair_delta = []
    max_pair_delta = []
    for coords in arrays:
        disp = np.linalg.norm(coords - reference, axis=1)
        mean_atom_disp.append(float(np.mean(disp)))
        delta = np.abs(pairwise_dist(coords)[upper] - ref_pairs)
        mean_pair_delta.append(float(np.mean(delta)))
        max_pair_delta.append(float(np.max(delta)))

    return {
        "file": name,
        "num_frames": len(arrays),
        "mean_atom_disp_final": mean_atom_disp[-1],
        "mean_pair_delta_final": mean_pair_delta[-1],
        "max_pair_delta_final": max_pair_delta[-1],
        "mean_pair_delta_peak": float(np.max(mean_pair_delta)),
        "max_pair_delta_peak": float(np.max(max_pair_delta)),
    }


def analyze_file(path: Path) -> dict:
    """Load an SDF trajectory and return its drift metrics row.

    Entries the supplier yields as ``None`` are dropped, so ``num_frames``
    and the ``*_final`` metrics refer to the frames that were kept.

    Args:
        path: Path to the ``.sdf`` file.

    Returns:
        The row produced by :func:`analyze_frames` for this file.
    """
    from rdkit import Chem  # lazy: keeps module import free of RDKit

    supplier = Chem.SDMolSupplier(str(path), removeHs=False)
    mols = [m for m in supplier if m is not None]
    if len(mols) < 2:
        return {"file": path.name, "error": "not_enough_frames"}
    return analyze_frames(path.name, [coords_from_mol(m) for m in mols])


def _summarize(valid: Sequence[dict]) -> dict:
    """Aggregate successful per-file rows; return ``{}`` when there are none."""
    if not valid:
        return {}
    worst = max(valid, key=lambda r: r["max_pair_delta_peak"])
    return {
        "num_files": len(valid),
        "mean_of_mean_pair_delta_final": float(np.mean([r["mean_pair_delta_final"] for r in valid])),
        "mean_of_max_pair_delta_final": float(np.mean([r["max_pair_delta_final"] for r in valid])),
        "max_of_max_pair_delta_peak": float(np.max([r["max_pair_delta_peak"] for r in valid])),
        "worst_file_by_peak_delta": worst["file"],
    }


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Analyze every ``trajectory_*.sdf`` in a directory and write a JSON report.

    Args:
        argv: Command-line arguments; ``None`` makes argparse read
            ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        ``0`` after the report has been written.

    Raises:
        SystemExit: If no trajectory files are found in the directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--traj-dir",
        type=str,
        default="reports/trajectories",
        help="Directory with trajectory_*.sdf files.",
    )
    parser.add_argument(
        "--out",
        type=str,
        default="reports/trajectory_diagnostics.json",
        help="Output JSON path.",
    )
    args = parser.parse_args(argv)

    traj_dir = Path(args.traj_dir)
    files = sorted(traj_dir.glob("trajectory_*.sdf"))
    if not files:
        raise SystemExit(f"No trajectory files found in {traj_dir}")

    rows = [analyze_file(p) for p in files]
    # Error rows stay in "files" but are excluded from the aggregate summary.
    summary = _summarize([r for r in rows if "error" not in r])

    payload = {"summary": summary, "files": rows}
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    print(json.dumps(summary, indent=2))
    print(f"Wrote diagnostics: {out_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())