NavyaNayer committed on
Commit
cc40705
·
verified ·
1 Parent(s): 3a42b37

Delete emotions.py

Browse files
Files changed (1) hide show
  1. emotions.py +0 -235
emotions.py DELETED
@@ -1,235 +0,0 @@
1
- <<<<<<< HEAD
2
- import pandas as pd
3
- import torch
4
- from datasets import load_dataset, Dataset
5
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
6
- import numpy as np
7
- from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
8
-
9
- # Load dataset
10
- dataset = load_dataset("go_emotions")
11
-
12
- # Print dataset columns
13
- print("Dataset Columns Before Preprocessing:", dataset["train"].column_names)
14
-
15
- # Ensure labels exist
16
- if "labels" not in dataset["train"].column_names:
17
- raise KeyError("Column 'labels' is missing! Check dataset structure.")
18
-
19
- # Load tokenizer
20
- model_checkpoint = "distilbert-base-uncased"
21
-
22
- tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
23
-
24
- # Preprocessing function (Take only the first label for single-label classification)
25
- def preprocess_data(batch):
26
- encoding = tokenizer(batch["text"], padding="max_length", truncation=True)
27
-
28
- # Take only the first label (for single-label classification)
29
- encoding["labels"] = batch["labels"][0] if batch["labels"] else 0 # Default to 0 if empty
30
- return encoding
31
-
32
- # Tokenize dataset
33
- encoded_dataset = dataset.map(preprocess_data, batched=False, remove_columns=["text"])
34
-
35
- # Set format for PyTorch
36
- encoded_dataset.set_format("torch")
37
-
38
- # Load model for single-label classification (28 classes)
39
- num_labels = 28 # Change based on dataset labels
40
- model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
41
-
42
- # Training arguments
43
- args = TrainingArguments(
44
- output_dir="./results",
45
- eval_strategy="epoch",
46
- save_strategy="epoch",
47
- save_total_limit=1,
48
- logging_strategy="no",
49
- per_device_train_batch_size=32, # Increase batch size
50
- per_device_eval_batch_size=32,
51
- num_train_epochs=2, # Reduce epochs
52
- weight_decay=0.01,
53
- load_best_model_at_end=True,
54
- fp16=True, # Mixed precision for speedup
55
- gradient_accumulation_steps=2, # Helps with large batch sizes
56
- )
57
-
58
-
59
- # Compute metrics function
60
- def compute_metrics(eval_pred):
61
- logits, labels = eval_pred
62
-
63
- # Convert logits to class predictions
64
- predictions = np.argmax(logits, axis=-1)
65
-
66
- accuracy = accuracy_score(labels, predictions)
67
- f1 = f1_score(labels, predictions, average="weighted")
68
-
69
- return {"accuracy": accuracy, "f1": f1}
70
-
71
- # Initialize Trainer
72
- trainer = Trainer(
73
- model=model,
74
- args=args,
75
- train_dataset=encoded_dataset["train"],
76
- eval_dataset=encoded_dataset["validation"],
77
- compute_metrics=compute_metrics
78
- )
79
-
80
- # Train model
81
- trainer.train()
82
- print("Training completed!")
83
-
84
- # Save model and tokenizer
85
- model.save_pretrained("./saved_model")
86
- tokenizer.save_pretrained("./saved_model")
87
- print("Model and tokenizer saved!")
88
-
89
- # ====== Evaluation on Test Set ======
90
- print("\nEvaluating model on test set...")
91
-
92
- # Get test dataset
93
- test_dataset = encoded_dataset["test"]
94
-
95
- # Make predictions
96
- predictions = trainer.predict(test_dataset)
97
- logits = predictions.predictions
98
-
99
- # Convert logits to class predictions
100
- y_pred = np.argmax(logits, axis=-1)
101
- y_true = test_dataset["labels"].numpy()
102
-
103
- # Compute accuracy and F1-score
104
- accuracy = accuracy_score(y_true, y_pred)
105
- f1 = f1_score(y_true, y_pred, average="weighted")
106
-
107
- # Print evaluation results
108
- print("\nEvaluation Results:")
109
- print(f"Test Accuracy: {accuracy:.4f}")
110
- print(f"Test F1 Score: {f1:.4f}")
111
-
112
- # Print classification report
113
- print("\nClassification Report:\n", classification_report(y_true, y_pred))
114
-
115
- # Save test results
116
- pd.DataFrame({"true_labels": y_true.tolist(), "predicted_labels": y_pred.tolist()}).to_csv("test_results.csv", index=False)
117
- print("Test results saved to 'test_results.csv'!")
118
- =======
119
- import pandas as pd
120
- import torch
121
- from datasets import load_dataset, Dataset
122
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
123
- import numpy as np
124
- from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
125
-
126
- # Load dataset
127
- dataset = load_dataset("go_emotions")
128
-
129
- # Print dataset columns
130
- print("Dataset Columns Before Preprocessing:", dataset["train"].column_names)
131
-
132
- # Ensure labels exist
133
- if "labels" not in dataset["train"].column_names:
134
- raise KeyError("Column 'labels' is missing! Check dataset structure.")
135
-
136
- # Load tokenizer
137
- model_checkpoint = "distilbert-base-uncased"
138
-
139
- tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
140
-
141
- # Preprocessing function (Take only the first label for single-label classification)
142
- def preprocess_data(batch):
143
- encoding = tokenizer(batch["text"], padding="max_length", truncation=True)
144
-
145
- # Take only the first label (for single-label classification)
146
- encoding["labels"] = batch["labels"][0] if batch["labels"] else 0 # Default to 0 if empty
147
- return encoding
148
-
149
- # Tokenize dataset
150
- encoded_dataset = dataset.map(preprocess_data, batched=False, remove_columns=["text"])
151
-
152
- # Set format for PyTorch
153
- encoded_dataset.set_format("torch")
154
-
155
- # Load model for single-label classification (28 classes)
156
- num_labels = 28 # Change based on dataset labels
157
- model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
158
-
159
- # Training arguments
160
- args = TrainingArguments(
161
- output_dir="./results",
162
- eval_strategy="epoch",
163
- save_strategy="epoch",
164
- save_total_limit=1,
165
- logging_strategy="no",
166
- per_device_train_batch_size=32, # Increase batch size
167
- per_device_eval_batch_size=32,
168
- num_train_epochs=2, # Reduce epochs
169
- weight_decay=0.01,
170
- load_best_model_at_end=True,
171
- fp16=True, # Mixed precision for speedup
172
- gradient_accumulation_steps=2, # Helps with large batch sizes
173
- )
174
-
175
-
176
- # Compute metrics function
177
- def compute_metrics(eval_pred):
178
- logits, labels = eval_pred
179
-
180
- # Convert logits to class predictions
181
- predictions = np.argmax(logits, axis=-1)
182
-
183
- accuracy = accuracy_score(labels, predictions)
184
- f1 = f1_score(labels, predictions, average="weighted")
185
-
186
- return {"accuracy": accuracy, "f1": f1}
187
-
188
- # Initialize Trainer
189
- trainer = Trainer(
190
- model=model,
191
- args=args,
192
- train_dataset=encoded_dataset["train"],
193
- eval_dataset=encoded_dataset["validation"],
194
- compute_metrics=compute_metrics
195
- )
196
-
197
- # Train model
198
- trainer.train()
199
- print("Training completed!")
200
-
201
- # Save model and tokenizer
202
- model.save_pretrained("./saved_model")
203
- tokenizer.save_pretrained("./saved_model")
204
- print("Model and tokenizer saved!")
205
-
206
- # ====== Evaluation on Test Set ======
207
- print("\nEvaluating model on test set...")
208
-
209
- # Get test dataset
210
- test_dataset = encoded_dataset["test"]
211
-
212
- # Make predictions
213
- predictions = trainer.predict(test_dataset)
214
- logits = predictions.predictions
215
-
216
- # Convert logits to class predictions
217
- y_pred = np.argmax(logits, axis=-1)
218
- y_true = test_dataset["labels"].numpy()
219
-
220
- # Compute accuracy and F1-score
221
- accuracy = accuracy_score(y_true, y_pred)
222
- f1 = f1_score(y_true, y_pred, average="weighted")
223
-
224
- # Print evaluation results
225
- print("\nEvaluation Results:")
226
- print(f"Test Accuracy: {accuracy:.4f}")
227
- print(f"Test F1 Score: {f1:.4f}")
228
-
229
- # Print classification report
230
- print("\nClassification Report:\n", classification_report(y_true, y_pred))
231
-
232
- # Save test results
233
- pd.DataFrame({"true_labels": y_true.tolist(), "predicted_labels": y_pred.tolist()}).to_csv("test_results.csv", index=False)
234
- print("Test results saved to 'test_results.csv'!")
235
- >>>>>>> b1313c5d084e410cadf261f2fafd8929cb149a4f