Louisko committed
Commit 87a4f0b · verified · 1 parent: ff32fe1

Create app.py

Files changed (1): app.py (+203, -0)
app.py ADDED
@@ -0,0 +1,203 @@
import torch
import numpy as np
import pandas as pd
import random
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def set_seed(seed_value=30):
    """Set seeds across libraries for reproducibility."""
    random.seed(seed_value)                     # Python random module
    np.random.seed(seed_value)                  # NumPy
    torch.manual_seed(seed_value)               # Torch (CPU)
    torch.cuda.manual_seed_all(seed_value)      # all GPUs, if using multi-GPU
    torch.backends.cudnn.deterministic = True   # cuDNN determinism
    torch.backends.cudnn.benchmark = False

# Example usage
set_seed(30)

# Load the dataset (update this path to where your data is stored, e.g. in Colab)
data_path = 'final_dataset.csv'
data = pd.read_csv(data_path)

# Load the model and tokenizer onto the device chosen above
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model.to(device)
model.eval()


# Generate a beam-search summary for each text
def generate_summaries(texts, model, tokenizer, device, max_length=150):
    summaries = []
    for text in texts:
        encoded_text = tokenizer.encode(
            "summarize: " + text,
            return_tensors='pt', max_length=512, truncation=True
        ).to(device)
        summary_ids = model.generate(
            encoded_text, max_length=max_length, num_beams=4, early_stopping=True
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries
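
# Optional smoke test (illustrative sketch): summarizing one dummy paragraph
# first surfaces tokenizer/device problems before the full chunked loop below.
sample_summary = generate_summaries(
    ["The quick brown fox jumps over the lazy dog. " * 20],
    model, tokenizer, device, max_length=40
)
print("Sample summary:", sample_summary[0])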

# Summarize in chunks to keep memory usage manageable
chunk_size = 10  # adjust based on dataset size and memory constraints
num_chunks = len(data) // chunk_size + (1 if len(data) % chunk_size != 0 else 0)

all_summaries = []
for i in range(num_chunks):
    batch = data['Content'][i * chunk_size:(i + 1) * chunk_size]
    batch_summaries = generate_summaries(batch, model, tokenizer, device)
    all_summaries.extend(batch_summaries)

# Add summaries to the DataFrame and save to a new CSV file
data['Summary'] = all_summaries
output_path = '/content/summarized_data.csv'
data.to_csv(output_path, index=False)
print(f"Data with summaries saved to {output_path}")
67
+
68
+ class PolicyDataset(Dataset):
69
+ def __init__(self, data, tokenizer, max_input_length=512, max_target_length=128):
70
+ self.data = data
71
+ self.tokenizer = tokenizer
72
+ self.max_input_length = max_input_length
73
+ self.max_target_length = max_target_length
74
+
75
+ def __len__(self):
76
+ return len(self.data)
77
+
78
+ def __getitem__(self, idx):
79
+ policy_text = self.data.iloc[idx]['Content']
80
+ summary_text = self.data.iloc[idx]['Summary']
81
+
82
+ input_encoding = self.tokenizer.encode_plus(
83
+ policy_text,
84
+ max_length=self.max_input_length,
85
+ padding='max_length',
86
+ truncation=True,
87
+ return_tensors='pt'
88
+ )
89
+
90
+ target_encoding = self.tokenizer.encode_plus(
91
+ summary_text,
92
+ max_length=self.max_target_length,
93
+ padding='max_length',
94
+ truncation=True,
95
+ return_tensors='pt'
96
+ )
97
+
98
+ return {
99
+ 'input_ids': input_encoding['input_ids'].squeeze(),
100
+ 'attention_mask': input_encoding['attention_mask'].squeeze(),
101
+ 'labels': target_encoding['input_ids'].squeeze(),
102
+ 'labels_mask': target_encoding['attention_mask'].squeeze()
103
+ }
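
# Optional shape check (illustrative): one item should yield a 512-token input
# and a 128-token label vector, matching the defaults above.
# probe = PolicyDataset(data, tokenizer)[0]
# assert probe['input_ids'].shape == (512,) and probe['labels'].shape == (128,)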

data = pd.read_csv(output_path)  # reload the summarized data saved above
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

# Prepare data splits and loaders
train_data, eval_data = train_test_split(data, test_size=0.1, random_state=42)
train_dataset = PolicyDataset(train_data, tokenizer)
eval_dataset = PolicyDataset(eval_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False)


def train(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)  # shape [batch_size, seq_length]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits  # shape [batch_size, seq_length, vocab_size]

        # CrossEntropyLoss expects [batch * seq_len, vocab_size] logits against
        # [batch * seq_len] targets; positions labeled -100 are ignored.
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)
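
# Optional (a common stabilizer for T5 fine-tuning, not required): clip gradients
# before optimizer.step() inside train(), e.g.
# torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)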


def evaluate(model, eval_loader, criterion, device):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits

            # Loss over non-padding positions (-100 is ignored by the criterion)
            loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
            total_loss += loss.item()

            # Token-level F1: compare argmax predictions with labels, skipping
            # the -100 padding positions
            predictions = torch.argmax(logits, dim=-1).flatten().cpu().numpy()
            labels_flat = labels.flatten().cpu().numpy()
            valid_indices = labels_flat != -100
            all_predictions.extend(predictions[valid_indices])
            all_labels.extend(labels_flat[valid_indices])

    f1 = f1_score(all_labels, all_predictions, average='macro')
    return total_loss / len(eval_loader), f1
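
# Note (optional): token-level macro F1 is only a rough proxy for summary
# quality; ROUGE is the more standard summarization metric. A minimal sketch,
# assuming the `rouge-score` package is installed (pip install rouge-score):
# from rouge_score import rouge_scorer
# scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
# rouge = scorer.score(reference_summary, generated_summary)  # hypothetical strings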

optimizer = optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()  # default ignore_index=-100 matches the label masking

# Training loop
for epoch in range(5):  # adjust the number of epochs as needed
    train_loss = train(model, train_loader, optimizer, criterion, device)
    eval_loss, eval_f1 = evaluate(model, eval_loader, criterion, device)
    print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Eval Loss = {eval_loss:.4f}, Eval F1 = {eval_f1:.4f}")


# Grid search: fine-tune a fresh model for each learning-rate/batch-size pair
def run_training(lr, batch_size, number_of_epochs=5):
    model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
    model.train()
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(number_of_epochs):
        train_loss = train(model, train_loader, optimizer, criterion, device)
        eval_loss, eval_f1 = evaluate(model, eval_loader, criterion, device)
        print(f"LR: {lr}, Batch size: {batch_size}, Epoch: {epoch + 1}, "
              f"Train Loss: {train_loss:.4f}, Eval Loss: {eval_loss:.4f}, Eval F1: {eval_f1:.4f}")

# Hyperparameters to test
learning_rates = [1e-5, 3e-5, 5e-5]
batch_sizes = [16, 32, 64]

for lr in learning_rates:
    for batch_size in batch_sizes:
        run_training(lr, batch_size, number_of_epochs=5)
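
# Optional follow-up (sketch; the directory name is illustrative): persist a
# fine-tuned model with the standard Hugging Face save API. Note that
# run_training() keeps its model local, so it would need to return (or save)
# the model for a grid-search run to be captured here; as written, `model`
# still refers to the one trained in the earlier single training loop.
# save_dir = 'finetuned_t5_small'
# model.save_pretrained(save_dir)
# tokenizer.save_pretrained(save_dir)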