Viraj45 commited on
Commit
8324351
·
verified ·
1 Parent(s): 66080ae
Files changed (4) hide show
  1. .gitignore +101 -0
  2. main.py +115 -0
  3. model.py +45 -0
  4. requirements.txt +7 -0
.gitignore ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ env/
12
+ venv/
13
+ ENV/
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+
29
+ # PyInstaller
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .nox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ .hypothesis/
48
+ .pytest_cache/
49
+
50
+ # Translations
51
+ *.mo
52
+ *.pot
53
+
54
+ # Django stuff:
55
+ *.log
56
+ local_settings.py
57
+ db.sqlite3
58
+
59
+ # Flask stuff:
60
+ instance/
61
+ .webassets-cache
62
+
63
+ # Scrapy stuff:
64
+ .scrapy
65
+
66
+ # Sphinx documentation
67
+ docs/_build/
68
+
69
+ # PyBuilder
70
+ target/
71
+
72
+ # Jupyter Notebook
73
+ .ipynb_checkpoints
74
+
75
+ # IPython
76
+ profile_default/
77
+ ipython_config.py
78
+
79
+ # PyCharm
80
+ .idea/
81
+
82
+ # VS Code
83
+ .vscode/
84
+
85
+ # Mac
86
+ .DS_Store
87
+
88
+ # Windows
89
+ Thumbs.db
90
+ ehthumbs.db
91
+ Desktop.ini
92
+
93
+ # Outputs
94
+ models/
95
+ outputs/
96
+
97
+ # Pickle files
98
+ *.pkl
99
+
100
+ # Others
101
+ *.csv
main.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py
2
+
3
+ import os
4
+ import torch
5
+ from src import (
6
+ load_data,
7
+ preprocess_data,
8
+ encode_ids,
9
+ generate_negative_samples_vectorized,
10
+ NCFModel,
11
+ train_model,
12
+ evaluate_model
13
+ )
14
+ from torch.utils.data import DataLoader
15
+ from src import InteractionDataset
16
+
17
def main():
    """Run the full NCF pipeline: load data, preprocess, train, evaluate,
    and persist encoders / negative samples / user-item maps to disk.
    """
    # These names were used below but never imported anywhere in the file
    # (NameError at runtime). Kept function-local so this fix is
    # self-contained; they are all listed in requirements.txt.
    import pickle
    from collections import defaultdict

    import pandas as pd
    from sklearn.model_selection import train_test_split

    # Define directories
    data_dir = 'data/'
    models_dir = 'models/'
    outputs_dir = 'outputs/'

    # Create output directories if they don't exist
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(outputs_dir, exist_ok=True)

    # Load and preprocess raw event data
    data = load_data(data_dir)
    catalog, relevant_events = preprocess_data(data)

    # Encode raw user/item IDs into contiguous integer indices
    interactions, user_encoder, item_encoder = encode_ids(relevant_events)

    # Persist the encoders so inference code can map raw IDs later
    with open(os.path.join(outputs_dir, 'user_encoder.pkl'), 'wb') as f:
        pickle.dump(user_encoder, f)
    with open(os.path.join(outputs_dir, 'item_encoder.pkl'), 'wb') as f:
        pickle.dump(item_encoder, f)

    # Split interactions into training and testing sets
    train_data, test_data = train_test_split(interactions, test_size=0.2, random_state=42)
    print(f"\nTraining data shape: {train_data.shape}")
    print(f"Testing data shape: {test_data.shape}")

    # Build the combined positive + negative training set (4 negatives per
    # positive), shuffled with a fixed seed for reproducibility.
    print("Generating negative samples for training...")
    train_negative = generate_negative_samples_vectorized(train_data, num_negatives=4)
    train_positive = train_data[['user', 'item']].copy()
    train_positive['label'] = 1
    train_combined = pd.concat([train_positive, train_negative], ignore_index=True)
    train_combined = train_combined.sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"Total training samples: {train_combined.shape[0]}")

    # Save negative samples
    train_negative.to_pickle(os.path.join(outputs_dir, 'train_negative.pkl'))

    # Same construction for the test set
    print("Generating negative samples for testing...")
    test_negative = generate_negative_samples_vectorized(test_data, num_negatives=4)
    test_positive = test_data[['user', 'item']].copy()
    test_positive['label'] = 1
    test_combined = pd.concat([test_positive, test_negative], ignore_index=True)
    test_combined = test_combined.sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"Total testing samples: {test_combined.shape[0]}")

    # Save negative samples
    test_negative.to_pickle(os.path.join(outputs_dir, 'test_negative.pkl'))

    # Define Datasets and DataLoaders
    train_dataset = InteractionDataset(train_combined)
    test_dataset = InteractionDataset(test_combined)

    train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, num_workers=0, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False, num_workers=0, pin_memory=True)

    # Instantiate the model sized to the encoded ID spaces
    num_users = interactions['user'].nunique()
    num_items = interactions['item'].nunique()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'\nUsing device: {device}')

    model = NCFModel(num_users, num_items, embedding_size=50).to(device)

    # Train the model (early stopping via `patience`)
    trained_model, metrics = train_model(
        model=model,
        train_loader=train_loader,
        test_loader=test_loader,
        device=device,
        num_epochs=10,
        patience=3,
        learning_rate=0.001,
        weight_decay=1e-5
    )

    # Evaluate the model
    accuracy, roc_auc = evaluate_model(trained_model, test_loader, device)

    # Save each user's positive training items so recommendation code can
    # filter out already-seen items.
    user_positive_items = defaultdict(set)
    for row in train_data.itertuples(index=False):
        user_positive_items[row.user].add(row.item)

    with open(os.path.join(outputs_dir, 'user_positive_items.pkl'), 'wb') as f:
        pickle.dump(user_positive_items, f)

    print("\nTraining and evaluation completed successfully.")

if __name__ == "__main__":
    main()
model.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/model.py
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
class NCFModel(nn.Module):
    """Neural collaborative filtering model: user/item embeddings feed a small
    MLP that outputs a single interaction logit per (user, item) pair.
    """

    def __init__(self, num_users, num_items, embedding_size=50):
        """
        Initialize the NCF model with embedding layers and fully connected layers.

        Args:
            num_users (int): Total number of unique users.
            num_items (int): Total number of unique items.
            embedding_size (int): Size of the embedding vectors.
        """
        super(NCFModel, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)

        # MLP tower over the concatenated user+item embeddings.
        self.fc1 = nn.Linear(embedding_size * 2, 128)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(0.5)
        self.output_layer = nn.Linear(64, 1)

    def forward(self, user, item):
        """
        Forward pass through the model.

        Args:
            user (torch.LongTensor): Tensor of user IDs, shape (batch,).
            item (torch.LongTensor): Tensor of item IDs, shape (batch,).

        Returns:
            torch.Tensor: Logits of shape (batch,) indicating interaction
            likelihood (sigmoid is applied by the loss, not here).
        """
        user_emb = self.user_embedding(user)
        item_emb = self.item_embedding(item)
        x = torch.cat([user_emb, item_emb], dim=1)
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.output_layer(x)  # No sigmoid here; handled in loss function
        # BUGFIX: squeeze only the trailing feature dim. A bare .squeeze()
        # collapsed a batch of size 1 from shape (1, 1) to a 0-d scalar,
        # breaking loss/metric code that expects shape (batch,).
        return x.squeeze(-1)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ pandas==1.5.3
2
+ numpy==1.25.2
3
+ scikit-learn==1.2.2
4
+ matplotlib==3.7.2
5
+ seaborn==0.12.2
6
+ torch==2.0.1
7
+ tqdm==4.65.0