Initialize ai_rfm documentation and cu126 uv scaffolding.
Set up project policy files, baseline best-practice tracking, and a pre-commit performance gate so future train.py commits require measured RMSD improvement. Made-with: Cursor
This commit is contained in:
8
.pre-commit-config.yaml
Normal file
8
.pre-commit-config.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
repos:
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: train-performance-gate
|
||||
name: train.py performance gate
|
||||
entry: python scripts/precommit_performance_gate.py
|
||||
language: system
|
||||
pass_filenames: false
|
||||
1
.python-version
Normal file
1
.python-version
Normal file
@@ -0,0 +1 @@
|
||||
3.10
|
||||
8
BEST_PRACTICE.json
Normal file
8
BEST_PRACTICE.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"best_mean_rmsd_100": 999999.0,
|
||||
"num_runs": 100,
|
||||
"timestamp_utc": "1970-01-01T00:00:00Z",
|
||||
"command": "",
|
||||
"notes": "Bootstrap placeholder. Replace after first measured run.",
|
||||
"updated_by_commit": ""
|
||||
}
|
||||
35
GUIDELINES.md
Normal file
35
GUIDELINES.md
Normal file
@@ -0,0 +1,35 @@
|
||||
# GUIDELINES
|
||||
|
||||
## Purpose
|
||||
|
||||
Make overfitting robust and measurable, targeting `mean_rmsd_100 <= 1.0`.
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Modify code/config.
|
||||
2. Run training/evaluation and write `reports/latest_eval.json`.
|
||||
3. If improved, update `BEST_PRACTICE.json` in the same commit.
|
||||
4. Append one line to `README.md` attempt log.
|
||||
5. Commit.
|
||||
|
||||
## Required report format
|
||||
|
||||
`reports/latest_eval.json` must include:
|
||||
|
||||
- `mean_rmsd_100` (float, lower is better)
|
||||
- `num_runs` (int, must be 100)
|
||||
- `timestamp_utc`
|
||||
- `command`
|
||||
- `notes`
|
||||
|
||||
## Repro notes
|
||||
|
||||
- Keep seed explicit in commands.
|
||||
- Keep sample path explicit.
|
||||
- Prefer additive experiments (do not silently remove prior working options).
|
||||
|
||||
## Safety
|
||||
|
||||
- If pre-commit blocks due to no improvement on `train.py`, either:
|
||||
- improve model and re-evaluate, or
|
||||
- commit non-`train.py` changes separately.
|
||||
38
README.md
Normal file
38
README.md
Normal file
@@ -0,0 +1,38 @@
|
||||
# ai_rfm
|
||||
|
||||
RFM overfitting sandbox for a single ligand sample, with hard quality gates.
|
||||
|
||||
## Environment first (UV, cu126 only)
|
||||
|
||||
1. Ensure Python 3.10 is available (the version pinned in `.python-version` and required by `pyproject.toml`).
|
||||
2. Install env and deps:
|
||||
- `uv sync`
|
||||
3. Install git hooks:
|
||||
- `uv run pre-commit install`
|
||||
|
||||
This repository is intentionally pinned to CUDA 12.6 PyTorch wheels and matching PyG wheels.
|
||||
|
||||
## Repository policy
|
||||
|
||||
- Every attempt must update this README (append a short entry in `## Attempt Log`).
|
||||
- Commits touching `train.py` must include:
|
||||
- `reports/latest_eval.json`
|
||||
- `BEST_PRACTICE.json`
|
||||
- a strictly better (lower) `mean_rmsd_100` than the previous best (enforced by pre-commit).
|
||||
|
||||
## Evaluation target
|
||||
|
||||
- Metric: mean RMSD over 100 runs (`batchsize=100` style aggregated evaluation).
|
||||
- Success criterion: `mean_rmsd_100 <= 1.0`.
|
||||
|
||||
## Key files
|
||||
|
||||
- `train.py`: training/evaluation entry point.
|
||||
- `GUIDELINES.md`: operating rules and workflow.
|
||||
- `BEST_PRACTICE.json`: current best-known metric and config.
|
||||
- `reports/latest_eval.json`: most recent measured metric.
|
||||
- `scripts/precommit_performance_gate.py`: pre-commit guard for train-related commits.
|
||||
|
||||
## Attempt Log
|
||||
|
||||
- 2026-04-16: Bootstrapped docs/environment policy and cu126 UV config. Added best-practice/performance gating scaffolding before the next training run.
|
||||
60
pyproject.toml
Normal file
60
pyproject.toml
Normal file
@@ -0,0 +1,60 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=79", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "ai-rfm"
|
||||
version = "0.1.0"
|
||||
description = "RFM overfitting playground with strict performance gating."
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10,<3.11"
|
||||
dependencies = [
|
||||
"numpy>=1.26.4",
|
||||
"rdkit>=2024.9.5",
|
||||
"biopython>=1.85",
|
||||
"hydra-core>=1.3.2",
|
||||
"transformers>=4.48.0",
|
||||
"graphein>=1.7.7",
|
||||
"esm==3.2.0",
|
||||
"e3nn>=0.5.0",
|
||||
"jaxtyping>=0.3.2",
|
||||
"mlcrate>=0.2.0",
|
||||
"omegaconf>=2.3.0",
|
||||
"mlflow>=2.0.0",
|
||||
"tqdm>=4.65.0",
|
||||
"accelerate>=1.9.0",
|
||||
"trackio>=0.2.2",
|
||||
"torchmetrics>=1.8.2",
|
||||
"tmtools>=0.3.0",
|
||||
"scikit-learn>=1.5.0",
|
||||
"torch==2.7.1+cu126",
|
||||
"torchaudio==2.7.1+cu126",
|
||||
"torchvision==0.22.1+cu126",
|
||||
"pyg_lib @ https://data.pyg.org/whl/torch-2.7.0%2Bcu126/pyg_lib-0.4.0%2Bpt27cu126-cp310-cp310-linux_x86_64.whl",
|
||||
"torch-scatter @ https://data.pyg.org/whl/torch-2.7.0%2Bcu126/torch_scatter-2.1.2%2Bpt27cu126-cp310-cp310-linux_x86_64.whl",
|
||||
"torch-sparse @ https://data.pyg.org/whl/torch-2.7.0%2Bcu126/torch_sparse-0.6.18%2Bpt27cu126-cp310-cp310-linux_x86_64.whl",
|
||||
"torch-cluster @ https://data.pyg.org/whl/torch-2.7.0%2Bcu126/torch_cluster-1.6.3%2Bpt27cu126-cp310-cp310-linux_x86_64.whl",
|
||||
"torch-spline-conv @ https://data.pyg.org/whl/torch-2.7.0%2Bcu126/torch_spline_conv-1.2.2%2Bpt27cu126-cp310-cp310-linux_x86_64.whl",
|
||||
"torch-geometric==2.6.1",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"pre-commit>=3.8.0",
|
||||
"pytest>=8.3.4",
|
||||
"pytest-cov>=6.1.1",
|
||||
]
|
||||
|
||||
[tool.uv.sources]
|
||||
torch = [{ index = "torch-cu126" }]
|
||||
torchaudio = [{ index = "torch-cu126" }]
|
||||
torchvision = [{ index = "torch-cu126" }]
|
||||
|
||||
[[tool.uv.index]]
|
||||
name = "torch-cu126"
|
||||
url = "https://download.pytorch.org/whl/cu126"
|
||||
explicit = true
|
||||
|
||||
[tool.uv]
|
||||
default-groups = ["dev"]
|
||||
cache-keys = [{ file = "pyproject.toml" }]
|
||||
7
reports/latest_eval.json
Normal file
7
reports/latest_eval.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"mean_rmsd_100": 999999.0,
|
||||
"num_runs": 100,
|
||||
"timestamp_utc": "1970-01-01T00:00:00Z",
|
||||
"command": "",
|
||||
"notes": "Bootstrap placeholder. Replace with real measured evaluation output."
|
||||
}
|
||||
83
scripts/precommit_performance_gate.py
Normal file
83
scripts/precommit_performance_gate.py
Normal file
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Repository root: this script lives one directory below it.
ROOT = Path(__file__).resolve().parents[1]
# Best metric recorded so far, kept at the repository root.
BEST = ROOT.joinpath("BEST_PRACTICE.json")
# Most recent measured evaluation report.
LATEST = ROOT.joinpath("reports", "latest_eval.json")
|
||||
|
||||
|
||||
def git(*args: str) -> str:
    """Run ``git`` with *args* from the repository root and return its output.

    Returns:
        The command's stdout as text, with surrounding whitespace stripped.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
    """
    command = ["git"]
    command.extend(args)
    output = subprocess.check_output(command, cwd=ROOT, text=True)
    return output.strip()
|
||||
|
||||
|
||||
def staged_files() -> set[str]:
    """Return the set of paths reported by ``git diff --cached --name-only``."""
    listing = git("diff", "--cached", "--name-only")
    # filter(None, ...) drops empty lines, e.g. when nothing is staged.
    return set(filter(None, listing.splitlines()))
|
||||
|
||||
|
||||
def read_json(path: Path) -> dict:
    """Load and return the JSON document stored at *path*.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the locale's default encoding.

    Raises:
        FileNotFoundError: If *path* does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # read_text() raises FileNotFoundError itself, so no separate exists()
    # check is needed (and none can race with the file being removed).
    return json.loads(path.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
def read_head_best() -> float:
    """Return ``best_mean_rmsd_100`` from ``BEST_PRACTICE.json`` as committed at HEAD.

    Returns ``float("inf")`` whenever no usable baseline is available, so that
    any finite measurement counts as an improvement:

    - ``git show HEAD:BEST_PRACTICE.json`` exits with a non-zero status,
    - the content is not valid JSON,
    - the JSON document is not an object,
    - the key is missing or its value cannot be converted to ``float``.
    """
    no_baseline = float("inf")

    try:
        raw = git("show", "HEAD:BEST_PRACTICE.json")
    except subprocess.CalledProcessError:
        return no_baseline

    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return no_baseline

    # Valid JSON may still be a list, string, number, ...; only objects have keys.
    if not isinstance(parsed, dict):
        return no_baseline

    try:
        return float(parsed.get("best_mean_rmsd_100", no_baseline))
    except (TypeError, ValueError):
        # e.g. null, a nested object, or a non-numeric string.
        return no_baseline
|
||||
|
||||
|
||||
def fail(msg: str) -> int:
    """Write *msg* to stderr with the ``[pre-commit]`` prefix and return exit code 1."""
    sys.stderr.write(f"[pre-commit] {msg}\n")
    return 1
|
||||
|
||||
|
||||
def main() -> int:
    """Block commits that stage ``train.py`` without a measured RMSD improvement.

    When ``train.py`` is staged, the commit must also stage
    ``reports/latest_eval.json`` and ``BEST_PRACTICE.json``. The report must
    cover exactly 100 runs, ``BEST_PRACTICE.json`` must carry the same metric
    as the report, and that metric must be strictly lower than the best value
    recorded at HEAD.

    Returns:
        0 when the commit may proceed, 1 when it must be rejected (the reason
        is printed to stderr).
    """
    staged = staged_files()
    if "train.py" not in staged:
        # The gate only applies to commits that touch train.py.
        return 0

    if "reports/latest_eval.json" not in staged:
        return fail("train.py changed: stage reports/latest_eval.json too.")
    if "BEST_PRACTICE.json" not in staged:
        return fail("train.py changed: stage BEST_PRACTICE.json too.")

    # A staged path may be a deletion or hold malformed content; report that as
    # a normal gate failure instead of crashing with a traceback.
    try:
        latest = read_json(LATEST)
        best = read_json(BEST)
        latest_rmsd = float(latest["mean_rmsd_100"])
        latest_runs = int(latest["num_runs"])
        best_rmsd = float(best["best_mean_rmsd_100"])
    except OSError as exc:
        return fail(f"cannot read evaluation files: {exc}")
    except json.JSONDecodeError as exc:
        return fail(f"evaluation files contain invalid JSON: {exc}")
    except (KeyError, TypeError, ValueError) as exc:
        return fail(f"evaluation files are malformed: {exc!r}")

    if latest_runs != 100:
        return fail(f"num_runs must be 100, got {latest_runs}.")

    # BEST_PRACTICE.json must have been updated to the newly measured value.
    if abs(best_rmsd - latest_rmsd) > 1e-12:
        return fail("BEST_PRACTICE.json best_mean_rmsd_100 must match reports/latest_eval.json.")

    previous_best = read_head_best()
    # Written as "not (a < b)" so that a NaN measurement is rejected as well.
    if not (latest_rmsd < previous_best):
        return fail(
            f"No improvement: latest={latest_rmsd:.6f}, previous_best={previous_best:.6f}. "
            "train.py commits require strict improvement."
        )

    print(
        f"[pre-commit] PASS: improved mean_rmsd_100 {previous_best:.6f} -> {latest_rmsd:.6f}",
        file=sys.stderr,
    )
    return 0
|
||||
|
||||
|
||||
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user