Usage Examples¶
This page provides several complete examples demonstrating the main features of arch_eval.
Basic Training with MNIST¶
Train a simple CNN on MNIST using torchvision data.
import torch
import torch.nn as nn
import torch.nn.functional as F
from arch_eval import Trainer, TrainingConfig
from torchvision import transforms
# ---------- Model ----------
class SimpleCNN(nn.Module):
    """Minimal two-conv CNN for 28x28 single-channel images (e.g. MNIST)."""

    def __init__(self):
        super().__init__()
        # 28x28x1 -> conv -> 26x26x32 -> conv -> 24x24x64 -> pool -> 12x12x64 = 9216
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return raw class logits of shape (batch, 10)."""
        hidden = F.relu(self.conv1(x))
        hidden = F.relu(self.conv2(hidden))
        hidden = F.max_pool2d(hidden, 2)
        flat = torch.flatten(hidden, 1)
        flat = F.relu(self.fc1(flat))
        return self.fc2(flat)
# ---------- Configuration ----------
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),  # standard MNIST statistics
])

config = TrainingConfig(
    dataset="mnist",
    dataset_params={"root": "./data", "split": "train", "download": True},
    transform=mnist_transform,
    training_args={
        "batch_size": 64,
        "learning_rate": 0.001,
        "num_epochs": 5,
    },
    task="classification",
    device="cuda" if torch.cuda.is_available() else "cpu",
    realtime=True,                   # show live metric plots while training
    save_plot=["loss", "accuracy"],  # persist these curves as images
    log_to_wandb=False,
    seed=42,                         # reproducible runs
)

# ---------- Train ----------
model = SimpleCNN()
trainer = Trainer(model, config)
history = trainer.train()
print(f"Final validation accuracy: {history['val_accuracy'][-1]:.4f}")
Benchmarking Two MLP Variants¶
Compare a small and a large MLP on synthetic data.
import torch.nn as nn
from arch_eval import Benchmark, BenchmarkConfig
class MLP(nn.Module):
    """Two-layer perceptron mapping 128 input features to 64 outputs."""

    def __init__(self, hidden=256):
        super().__init__()
        layers = [
            nn.Linear(128, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 64),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)
# Two size variants of the same architecture, benchmarked head-to-head.
models = [
    {"name": "Small MLP", "model": MLP(hidden=128)},
    {"name": "Large MLP", "model": MLP(hidden=512)},
]

config = BenchmarkConfig(
    dataset="synthetic classification",
    dataset_params={
        "n_samples": 5000,
        "n_features": 128,
        "n_classes": 64,
        "n_informative": 64,
    },
    training_args={
        "batch_size": 32,
        "learning_rate": 0.001,
        "num_epochs": 10,
    },
    compare_metrics=["accuracy", "loss"],
    parallel=True,        # run two models concurrently
    use_processes=False,  # use threads (safe for CPU; for GPU, keep sequential)
    device="cpu",         # force CPU for this example
)

benchmark = Benchmark(models, config)
results = benchmark.run()
print(results)
Hyperparameter Search with Random Search¶
Optimize learning rate and hidden size for a regression model.
import numpy as np
import torch.nn as nn
from arch_eval import HyperparameterOptimizer, TrainingConfig
class Regressor(nn.Module):
    """One-hidden-layer MLP mapping 20 input features to a single target value."""

    def __init__(self, hidden=64):
        super().__init__()
        stack = [nn.Linear(20, hidden), nn.ReLU(), nn.Linear(hidden, 1)]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        return self.net(x)
def model_fn():
    """Factory invoked once per trial to build a fresh, untrained model."""
    return Regressor()
base_config = TrainingConfig(
    dataset="synthetic regression",
    dataset_params={
        "n_samples": 2000,
        "n_features": 20,
        "noise": 0.1,
    },
    training_args={
        "num_epochs": 5,
        "batch_size": 32,
    },
    task="regression",
    realtime=False,  # disable plots during search
)

# 4 learning rates x 3 hidden sizes = 12 combinations in the full grid.
param_grid = {
    "learning_rate": [0.0001, 0.001, 0.01, 0.1],
    "hidden": [32, 64, 128],
}

optimizer = HyperparameterOptimizer(
    model_fn,
    base_config,
    param_grid,
    search_type="random",
    n_trials=6,        # try 6 random combinations
    metric="val_mse",
    mode="min",
)

results = optimizer.run()
print("Best trial:")
print(results.loc[results["val_mse"].idxmin()])
Using Callbacks – Early Stopping and Checkpointing¶
Train a model with early stopping and model checkpointing.
import torch.nn as nn  # needed for the nn.Linear model below

from arch_eval import Trainer, TrainingConfig
from arch_eval import EarlyStopping, ModelCheckpoint

config = TrainingConfig(
    dataset="synthetic classification",
    dataset_params={"n_samples": 1000, "n_features": 20, "n_classes": 5},
    training_args={"num_epochs": 50, "batch_size": 32, "learning_rate": 0.01},
    task="classification",
    callbacks=[
        # Stop when val_loss has not improved for 5 consecutive epochs.
        EarlyStopping(monitor="val_loss", patience=5, mode="min"),
        # Keep only the best checkpoint as measured by val_accuracy.
        ModelCheckpoint(
            filepath="./checkpoints/model-{epoch:02d}.pt",
            monitor="val_accuracy",
            save_best_only=True,
            mode="max"
        )
    ],
    checkpoint_dir="./checkpoints",
)

model = nn.Linear(20, 5)
trainer = Trainer(model, config)
history = trainer.train()
Custom Dataset from NumPy Arrays¶
Use your own data stored as NumPy arrays.
import numpy as np
import torch
from arch_eval import Trainer, TrainingConfig
# Generate random data: 1000 samples, 50 features, binary labels.
X = np.random.randn(1000, 50).astype(np.float32)
y = (X.sum(axis=1) > 0).astype(np.int64)  # label 1 when the feature sum is positive

config = TrainingConfig(
    dataset=(X, y),  # tuple (data, targets)
    training_args={"batch_size": 64, "learning_rate": 0.001, "num_epochs": 5},
    task="classification",
)

model = torch.nn.Linear(50, 2)  # 2 classes
trainer = Trainer(model, config)
trainer.train()
Distributed Training with DDP¶
Launch the script with torchrun; assume it is saved as train_ddp.py.
# train_ddp.py
import os  # RANK / WORLD_SIZE are read from the environment set by torchrun

import torch
import torch.nn as nn
import torch.distributed as dist
from arch_eval import Trainer, TrainingConfig, DistributedBackend


class Model(nn.Module):
    """Single linear classifier: 128 features -> 10 classes."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(128, 10)

    def forward(self, x):
        return self.fc(x)


# Get rank and world size from environment (set by torchrun)
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])

config = TrainingConfig(
    dataset="synthetic classification",
    dataset_params={"n_samples": 10000, "n_features": 128, "n_classes": 10},
    training_args={"batch_size": 64, "num_epochs": 10, "learning_rate": 0.01},
    distributed_backend=DistributedBackend.DISTRIBUTED,
    distributed_world_size=world_size,
    distributed_rank=rank,
    # Optional: shard dataset so each GPU sees different samples
    dataset_shard={"num_shards": world_size, "shard_id": rank},
    device="cuda",
)

model = Model()
trainer = Trainer(model, config)
trainer.train()
Run with:
torchrun --nproc_per_node=2 train_ddp.py
Profiling and Video Recording¶
Enable the profiler and record a video of the loss curve.
import torch.nn as nn  # for the example model below
from arch_eval import Trainer, TrainingConfig

config = TrainingConfig(
    dataset="synthetic classification",
    dataset_params={"n_samples": 1000, "n_features": 20, "n_classes": 5},
    training_args={"num_epochs": 5},
    profiler={
        "enabled": True,
        "activities": ["cpu", "cuda"],
        "schedule": {"wait": 1, "warmup": 1, "active": 2},  # profiler step schedule
        "trace_path": "./profiler_trace"
    },
    save_video=["loss"],  # record loss over time
    realtime=False,       # disable live window (optional)
)

model = nn.Linear(20, 5)
trainer = Trainer(model, config)
trainer.train()
# After training, a video file `loss.mp4` will be created (if ffmpeg is installed).
Using a HuggingFace Dataset¶
Load the IMDB dataset from Hugging Face and train a simple text classifier.
from datasets import load_dataset
from arch_eval import Trainer, TrainingConfig
import torch.nn as nn
# Load the IMDB reviews dataset from the Hugging Face Hub (downloads on first use).
dataset = load_dataset("imdb")
class TextClassifier(nn.Module):
    """Bag-of-embeddings classifier: mean-pooled token embeddings -> linear head."""

    def __init__(self, vocab_size=10000, embed_dim=128, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids):
        # input_ids: (batch, seq_len)
        pooled = self.embedding(input_ids).mean(dim=1)  # average pooling over tokens
        return self.fc(pooled)
config = TrainingConfig(
    dataset=dataset["train"],  # pass the dataset object
    dataset_streaming=False,   # set to True for IterableDataset
    training_args={"batch_size": 16, "num_epochs": 1},
    task="classification",
)

model = TextClassifier()
trainer = Trainer(model, config)
trainer.train()
(Note: This is a simplified example; real text classification requires proper tokenization and possibly a collate function.)
Custom Callback – Logging to File¶
Create a callback that writes metrics to a CSV file.
import csv
from arch_eval import Callback


class CSVLogger(Callback):
    """Callback that writes training metrics to a CSV file, one row per log step."""

    def __init__(self, filename="log.csv"):
        self.filename = filename
        # Open lazily on first log: constructing the callback then has no side
        # effects (the original eager open truncated any existing file and
        # leaked the handle if training never ran).
        self.file = None
        self.writer = None

    def on_log(self, trainer, metrics, step):
        if self.writer is None:
            self.file = open(self.filename, "w", newline="")
            # Header is fixed by the first metrics dict; later calls must use the same keys.
            self.writer = csv.DictWriter(self.file, fieldnames=["step"] + list(metrics.keys()))
            self.writer.writeheader()
        row = {"step": step, **metrics}
        self.writer.writerow(row)
        self.file.flush()  # keep the file readable while training is still running

    def on_train_end(self, trainer):
        # Guard: the file may never have been opened if nothing was logged.
        if self.file is not None:
            self.file.close()
# Use it: attach the logger through TrainingConfig's callbacks list.
config = TrainingConfig(
...,  # placeholder for the rest of your configuration (dataset, training_args, ...)
callbacks=[CSVLogger("training_log.csv")]
)
Using the Plugin System¶
Create a simple plugin that prints a message at the start of each epoch.
File: my_plugin.py (place it in your Python path)
from arch_eval.plugins import hook


@hook("on_epoch_start")
def epoch_start(trainer, epoch):
    """Plugin hook: announce the start of every epoch."""
    print(f"Starting epoch {epoch}!")
Now run any training script; the plugin will be discovered automatically and the hook will execute.
These examples cover most of the library’s functionality. For further details, refer to the Guide and API Reference.