# main.py
import os
import pickle
from collections import defaultdict

import pandas as pd
import torch
from torch.utils.data import DataLoader

from src import (
    InteractionDataset,
    NCFModel,
    encode_ids,
    evaluate_model,
    generate_negative_samples_vectorized,
    load_data,
    preprocess_data,
    train_model,
)
def _save_pickle(path, obj):
    """Serialize *obj* to *path* with pickle."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def _build_labeled_set(positive_df, negatives_path):
    """Build a shuffled, labeled interaction set from positive interactions.

    Generates 4 negative samples per positive (saved to *negatives_path*
    as a pickle), labels positives 1, concatenates, and returns the
    combined frame shuffled with a fixed seed for reproducibility.
    """
    negatives = generate_negative_samples_vectorized(positive_df, num_negatives=4)
    negatives.to_pickle(negatives_path)
    positives = positive_df[['user', 'item']].copy()
    positives['label'] = 1
    combined = pd.concat([positives, negatives], ignore_index=True)
    # frac=1 sample == full reshuffle; fixed seed keeps runs comparable
    return combined.sample(frac=1, random_state=42).reset_index(drop=True)


def main():
    """Run the end-to-end NCF pipeline: load, preprocess, sample, train, evaluate.

    Side effects: creates ``models/`` and ``outputs/`` directories and writes
    encoder, negative-sample, and user-positive-item pickles under ``outputs/``.
    """
    # Define directories
    data_dir = 'data/'
    models_dir = 'models/'
    outputs_dir = 'outputs/'
    # Create output directories if they don't exist
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(outputs_dir, exist_ok=True)

    # Load and preprocess raw event data
    data = load_data(data_dir)
    catalog, relevant_events = preprocess_data(data)

    # Encode raw user/item IDs to contiguous integer indices
    interactions, user_encoder, item_encoder = encode_ids(relevant_events)

    # Persist encoders so later recommendation code can map back to raw IDs
    _save_pickle(os.path.join(outputs_dir, 'user_encoder.pkl'), user_encoder)
    _save_pickle(os.path.join(outputs_dir, 'item_encoder.pkl'), item_encoder)

    # Split data into training and testing sets (80/20, seeded).
    # NOTE: the previous code called sklearn's train_test_split without ever
    # importing it (NameError at runtime); this pandas-native split fixes that
    # without adding a new dependency.
    test_data = interactions.sample(frac=0.2, random_state=42)
    train_data = interactions.drop(test_data.index)
    print(f"\nTraining data shape: {train_data.shape}")
    print(f"Testing data shape: {test_data.shape}")

    # Build labeled (positive + negative) sets; negatives are also pickled
    print("Generating negative samples for training...")
    train_combined = _build_labeled_set(
        train_data, os.path.join(outputs_dir, 'train_negative.pkl'))
    print(f"Total training samples: {train_combined.shape[0]}")

    print("Generating negative samples for testing...")
    test_combined = _build_labeled_set(
        test_data, os.path.join(outputs_dir, 'test_negative.pkl'))
    print(f"Total testing samples: {test_combined.shape[0]}")

    # Define Datasets and DataLoaders
    train_dataset = InteractionDataset(train_combined)
    test_dataset = InteractionDataset(test_combined)
    train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, num_workers=0, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False, num_workers=0, pin_memory=True)

    # Instantiate the model sized to the encoded ID spaces
    num_users = interactions['user'].nunique()
    num_items = interactions['item'].nunique()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'\nUsing device: {device}')
    model = NCFModel(num_users, num_items, embedding_size=50).to(device)

    # Train the model (early stopping via patience)
    trained_model, metrics = train_model(
        model=model,
        train_loader=train_loader,
        test_loader=test_loader,
        device=device,
        num_epochs=10,
        patience=3,
        learning_rate=0.001,
        weight_decay=1e-5
    )

    # Evaluate the trained model on the held-out set
    accuracy, roc_auc = evaluate_model(trained_model, test_loader, device)

    # Save user -> positive-item sets so recommendation code can filter
    # already-seen items per user
    user_positive_items = defaultdict(set)
    for row in train_data.itertuples(index=False):
        user_positive_items[row.user].add(row.item)
    _save_pickle(os.path.join(outputs_dir, 'user_positive_items.pkl'), user_positive_items)
    print("\nTraining and evaluation completed successfully.")
# Run the full pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    main()