done
- .gitignore +101 -0
- main.py +115 -0
- model.py +45 -0
- requirements.txt +7 -0
.gitignore
ADDED
@@ -0,0 +1,101 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+venv/
+ENV/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# PyCharm
+.idea/
+
+# VS Code
+.vscode/
+
+# Mac
+.DS_Store
+
+# Windows
+Thumbs.db
+ehthumbs.db
+Desktop.ini
+
+# Outputs
+models/
+outputs/
+
+# Pickle files
+*.pkl
+
+# Others
+*.csv
main.py
ADDED
@@ -0,0 +1,115 @@
+# main.py
+
+import os
+import pickle
+from collections import defaultdict
+
+import pandas as pd
+import torch
+from sklearn.model_selection import train_test_split
+from torch.utils.data import DataLoader
+
+from src import (
+    load_data,
+    preprocess_data,
+    encode_ids,
+    generate_negative_samples_vectorized,
+    InteractionDataset,
+    NCFModel,
+    train_model,
+    evaluate_model,
+)
+
+
+def main():
+    # Define directories
+    data_dir = 'data/'
+    models_dir = 'models/'
+    outputs_dir = 'outputs/'
+
+    # Create directories if they don't exist
+    os.makedirs(models_dir, exist_ok=True)
+    os.makedirs(outputs_dir, exist_ok=True)
+
+    # Load data
+    data = load_data(data_dir)
+
+    # Preprocess data
+    catalog, relevant_events = preprocess_data(data)
+
+    # Encode IDs
+    interactions, user_encoder, item_encoder = encode_ids(relevant_events)
+
+    # Save encoders
+    with open(os.path.join(outputs_dir, 'user_encoder.pkl'), 'wb') as f:
+        pickle.dump(user_encoder, f)
+
+    with open(os.path.join(outputs_dir, 'item_encoder.pkl'), 'wb') as f:
+        pickle.dump(item_encoder, f)
+
+    # Split data into training and testing sets
+    train_data, test_data = train_test_split(interactions, test_size=0.2, random_state=42)
+    print(f"\nTraining data shape: {train_data.shape}")
+    print(f"Testing data shape: {test_data.shape}")
+
+    # Generate negative samples for training
+    print("Generating negative samples for training...")
+    train_negative = generate_negative_samples_vectorized(train_data, num_negatives=4)
+    train_positive = train_data[['user', 'item']].copy()
+    train_positive['label'] = 1
+    train_combined = pd.concat([train_positive, train_negative], ignore_index=True)
+    train_combined = train_combined.sample(frac=1, random_state=42).reset_index(drop=True)
+    print(f"Total training samples: {train_combined.shape[0]}")
+
+    # Save negative samples
+    train_negative.to_pickle(os.path.join(outputs_dir, 'train_negative.pkl'))
+
+    # Generate negative samples for testing
+    print("Generating negative samples for testing...")
+    test_negative = generate_negative_samples_vectorized(test_data, num_negatives=4)
+    test_positive = test_data[['user', 'item']].copy()
+    test_positive['label'] = 1
+    test_combined = pd.concat([test_positive, test_negative], ignore_index=True)
+    test_combined = test_combined.sample(frac=1, random_state=42).reset_index(drop=True)
+    print(f"Total testing samples: {test_combined.shape[0]}")
+
+    # Save negative samples
+    test_negative.to_pickle(os.path.join(outputs_dir, 'test_negative.pkl'))
+
+    # Define Datasets and DataLoaders
+    train_dataset = InteractionDataset(train_combined)
+    test_dataset = InteractionDataset(test_combined)
+
+    train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, num_workers=0, pin_memory=True)
+    test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False, num_workers=0, pin_memory=True)
+
+    # Instantiate the model
+    num_users = interactions['user'].nunique()
+    num_items = interactions['item'].nunique()
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    print(f'\nUsing device: {device}')
+
+    model = NCFModel(num_users, num_items, embedding_size=50).to(device)
+
+    # Train the model
+    trained_model, metrics = train_model(
+        model=model,
+        train_loader=train_loader,
+        test_loader=test_loader,
+        device=device,
+        num_epochs=10,
+        patience=3,
+        learning_rate=0.001,
+        weight_decay=1e-5,
+    )
+
+    # Evaluate the model and report the results
+    accuracy, roc_auc = evaluate_model(trained_model, test_loader, device)
+    print(f"Test accuracy: {accuracy:.4f} | ROC AUC: {roc_auc:.4f}")
+
+    # Save user_positive_items for recommendations
+    user_positive_items = defaultdict(set)
+    for row in train_data.itertuples(index=False):
+        user_positive_items[row.user].add(row.item)
+
+    with open(os.path.join(outputs_dir, 'user_positive_items.pkl'), 'wb') as f:
+        pickle.dump(user_positive_items, f)
+
+    print("\nTraining and evaluation completed successfully.")
+
+
+if __name__ == "__main__":
+    main()
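main.py pulls load_data, preprocess_data, encode_ids, generate_negative_samples_vectorized, InteractionDataset, train_model, and evaluate_model from src, but none of those files are part of this commit. As a reading aid, here is a minimal sketch of what the two data-side helpers might look like, assuming `interactions` is a DataFrame with integer-encoded 'user' and 'item' columns; the actual implementations in src may differ.

```python
# Illustrative sketch only -- these helpers live in src/ and are NOT part of
# this commit; the real implementations may differ.

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset


def generate_negative_samples_vectorized(interactions, num_negatives=4):
    """Draw `num_negatives` random items per positive (user, item) pair.

    Sketch assumptions: items are encoded as 0..max_item and uniform
    sampling is acceptable. A production version should also filter out
    accidental collisions with the user's true positives.
    """
    num_items = interactions['item'].max() + 1
    users = np.repeat(interactions['user'].to_numpy(), num_negatives)
    items = np.random.randint(0, num_items, size=len(users))
    return pd.DataFrame({'user': users, 'item': items, 'label': 0})


class InteractionDataset(Dataset):
    """Wraps a (user, item, label) DataFrame for use with a DataLoader."""

    def __init__(self, df):
        self.users = torch.as_tensor(df['user'].to_numpy(), dtype=torch.long)
        self.items = torch.as_tensor(df['item'].to_numpy(), dtype=torch.long)
        self.labels = torch.as_tensor(df['label'].to_numpy(), dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]
```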
model.py
ADDED
@@ -0,0 +1,45 @@
+# src/model.py
+
+import torch
+import torch.nn as nn
+
+
+class NCFModel(nn.Module):
+    def __init__(self, num_users, num_items, embedding_size=50):
+        """
+        Initialize the NCF model with embedding layers and fully connected layers.
+
+        Args:
+            num_users (int): Total number of unique users.
+            num_items (int): Total number of unique items.
+            embedding_size (int): Size of the embedding vectors.
+        """
+        super(NCFModel, self).__init__()
+        self.user_embedding = nn.Embedding(num_users, embedding_size)
+        self.item_embedding = nn.Embedding(num_items, embedding_size)
+
+        self.fc1 = nn.Linear(embedding_size * 2, 128)
+        self.dropout1 = nn.Dropout(0.5)
+        self.fc2 = nn.Linear(128, 64)
+        self.dropout2 = nn.Dropout(0.5)
+        self.output_layer = nn.Linear(64, 1)
+
+    def forward(self, user, item):
+        """
+        Forward pass through the model.
+
+        Args:
+            user (torch.LongTensor): Tensor of user IDs.
+            item (torch.LongTensor): Tensor of item IDs.
+
+        Returns:
+            torch.Tensor: Output logits indicating interaction likelihood.
+        """
+        user_emb = self.user_embedding(user)
+        item_emb = self.item_embedding(item)
+        x = torch.cat([user_emb, item_emb], dim=1)
+        x = torch.relu(self.fc1(x))
+        x = self.dropout1(x)
+        x = torch.relu(self.fc2(x))
+        x = self.dropout2(x)
+        x = self.output_layer(x)  # No sigmoid here; handled in loss function
+        return x.squeeze(-1)  # Squeeze only the last dim so a batch of 1 keeps its batch dim
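Because forward returns raw logits (no sigmoid), the model is meant to be trained with nn.BCEWithLogitsLoss, which folds the sigmoid into the loss for numerical stability. A quick smoke test, not part of the commit and using made-up sizes:

```python
import torch
import torch.nn as nn

# Dummy sizes for illustration only.
model = NCFModel(num_users=1000, num_items=500, embedding_size=50)

users = torch.randint(0, 1000, (8,))   # batch of 8 user IDs
items = torch.randint(0, 500, (8,))    # batch of 8 item IDs
labels = torch.randint(0, 2, (8,)).float()

logits = model(users, items)           # shape: (8,)
loss = nn.BCEWithLogitsLoss()(logits, labels)
probs = torch.sigmoid(logits)          # apply sigmoid only at inference time
print(loss.item(), probs.shape)
```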
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+pandas==1.5.3
+numpy==1.25.2
+scikit-learn==1.2.2
+matplotlib==3.7.2
+seaborn==0.12.2
+torch==2.0.1
+tqdm==4.65.0