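"""Training entry point for newsclassifier.

Loads and preprocesses the headlines dataset, trains CustomModel with a
BCE-with-logits loss, logs metrics to Weights & Biases, and checkpoints the
best model (lowest validation loss) to Cfg.artifacts_path.
"""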
import gc
import os
import time
from typing import Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import wandb
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from newsclassifier.config.config import Cfg, logger
from newsclassifier.data import (NewsDataset, collate, data_split,
                                 load_dataset, preprocess)
from newsclassifier.models import CustomModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_step(train_loader: DataLoader, model, num_classes: int, loss_fn, optimizer, epoch: int) -> float:
    """Run one training epoch and return the average batch loss."""
    model.train()
    loss = 0.0
    total_iterations = len(train_loader)
    desc = f"Training - Epoch {epoch + 1}"
    for step, (inputs, labels) in tqdm(enumerate(train_loader), total=total_iterations, desc=desc):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()  # reset gradients
        y_pred = model(inputs)  # forward pass
        targets = F.one_hot(labels.long(), num_classes=num_classes).float()  # one-hot targets for BCE loss
        J = loss_fn(y_pred, targets)  # compute loss
        J.backward()  # backward pass
        optimizer.step()  # update weights
        loss += (J.detach().item() - loss) / (step + 1)  # running mean of batch losses
    return loss
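
# The `loss += (J.detach().item() - loss) / (step + 1)` update above is an
# incremental mean: after k batches it equals the plain average of the k batch
# losses without storing them. Worked example for batch losses [2.0, 4.0]:
#   step 0: 0.0 + (2.0 - 0.0) / 1 = 2.0
#   step 1: 2.0 + (4.0 - 2.0) / 2 = 3.0 == mean([2.0, 4.0])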

def eval_step(val_loader: DataLoader, model, num_classes: int, loss_fn, epoch: int) -> Tuple[float, np.ndarray, np.ndarray]:
    """Run one validation epoch; return average loss, one-hot targets, and predicted class indices."""
    model.eval()
    loss = 0.0
    total_iterations = len(val_loader)
    desc = f"Validation - Epoch {epoch + 1}"
    y_trues, y_preds = [], []
    with torch.inference_mode():
        for step, (inputs, labels) in tqdm(enumerate(val_loader), total=total_iterations, desc=desc):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            labels = labels.to(device)
            y_pred = model(inputs)
            targets = F.one_hot(labels.long(), num_classes=num_classes).float()  # one-hot targets for BCE loss
            J = loss_fn(y_pred, targets).item()
            loss += (J - loss) / (step + 1)  # running mean of batch losses
            y_trues.extend(targets.cpu().numpy())
            y_preds.extend(torch.argmax(y_pred, dim=1).cpu().numpy())
    return loss, np.vstack(y_trues), np.vstack(y_preds)
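
# Note on return shapes: np.vstack(y_trues) is (N, num_classes) one-hot rows,
# while np.vstack(y_preds) stacks scalar argmax indices into shape (N, 1).
# To compare the two directly (e.g. with sklearn.metrics), reduce the targets
# first: y_true_idx = y_trues.argmax(axis=1); y_pred_idx = y_preds.ravel().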

def train_loop(config=None):
    # ====================================================
    # loader
    # ====================================================
    if config is None:  # fall back to the static config when no override is passed in
        config = dict(
            batch_size=Cfg.batch_size,
            num_classes=Cfg.num_classes,
            epochs=Cfg.epochs,
            dropout_pb=Cfg.dropout_pb,
            learning_rate=Cfg.lr,
            lr_reduce_factor=Cfg.lr_redfactor,
            lr_reduce_patience=Cfg.lr_redpatience,
        )
    with wandb.init(project="NewsClassifier", config=config):
        config = wandb.config

        logger.info("Preparing Data.")
        df = load_dataset(Cfg.dataset_loc)
        ds, headlines_df, class_to_index, index_to_class = preprocess(df)
        train_ds, val_ds = data_split(ds, test_size=Cfg.test_size)
        train_dataset = NewsDataset(train_ds)
        valid_dataset = NewsDataset(val_ds)
        train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset, batch_size=config.batch_size, shuffle=False, num_workers=4, pin_memory=True, drop_last=False)
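        # drop_last=True keeps every optimizer step on a full train batch by
        # discarding a ragged final batch; validation uses drop_last=False so
        # every example is scored exactly once.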

        # ====================================================
        # model
        # ====================================================
        logger.info("Creating Custom Model.")
        num_classes = config.num_classes
        model = CustomModel(num_classes=num_classes, dropout_pb=config.dropout_pb)
        model.to(device)  # module-level device; the same one train_step/eval_step use

        # ====================================================
        # Training components
        # ====================================================
        criterion = nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="min", factor=config.lr_reduce_factor, patience=config.lr_reduce_patience
        )
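        # BCEWithLogitsLoss applies a per-class sigmoid internally, so the model
        # must return raw logits of shape (batch, num_classes) to match the float
        # one-hot targets built in train_step/eval_step. ReduceLROnPlateau scales
        # the learning rate by `factor` after `patience` epochs without val_loss
        # improvement, which is why scheduler.step(val_loss) below receives the
        # validation loss rather than being called with no argument.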

        # ====================================================
        # loop
        # ====================================================
        wandb.watch(model, criterion, log="all", log_freq=10)
        min_loss = np.inf
        logger.info("Starting Training Loop.")
        for epoch in range(config.epochs):
            try:
                start_time = time.time()

                # step
                train_loss = train_step(train_loader, model, num_classes, criterion, optimizer, epoch)
                val_loss, _, _ = eval_step(valid_loader, model, num_classes, criterion, epoch)
                scheduler.step(val_loss)

                # scoring
                elapsed = time.time() - start_time
                wandb.log({"epoch": epoch + 1, "train_loss": train_loss, "val_loss": val_loss})
                print(f"Epoch {epoch + 1} - avg_train_loss: {train_loss:.4f} avg_val_loss: {val_loss:.4f} time: {elapsed:.0f}s")

                if val_loss < min_loss:
                    min_loss = val_loss
                    print("Best score: saving model.")
                    os.makedirs(Cfg.artifacts_path, exist_ok=True)
                    model.save(Cfg.artifacts_path)
                    print(f"\nSaved best model score: {min_loss:.4f}\n\n")
            except Exception as e:
                logger.error(f"Epoch - {epoch + 1}, {e}")

        wandb.save(os.path.join(Cfg.artifacts_path, "model.pt"))
        torch.cuda.empty_cache()
        gc.collect()

if __name__ == "__main__":
train_loop()
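
# A usage sketch (an assumption, not documented in this repo): because
# train_loop() calls wandb.init(config=config), it can also serve as a W&B
# sweep target, with sweep-provided hyperparameters overriding the defaults:
#
#   wandb sweep sweep.yaml    # a hypothetical sweep.yaml with `program: train.py`
#   wandb agent <entity>/NewsClassifier/<sweep_id>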