AliArshad committed
Commit 5e2f008 · verified · 1 Parent(s): 794588f

Delete code.py

Files changed (1)
  1. code.py +0 -157
code.py DELETED
@@ -1,157 +0,0 @@
import numpy as np
import pandas as pd
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix

# Mount Google Drive (Colab only)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Path to the Excel file in Google Drive
file_path = '/content/drive/My Drive/filtered_data.xlsx'

# Read the Excel file into a pandas DataFrame
df = pd.read_excel(file_path)

# Keep only the columns needed for training and evaluation
selected_columns = ['Short Description', 'Severity Label', 'Project']
new_df = df[selected_columns].copy()

# Exclude bug reports with 'normal' severity; copy to avoid SettingWithCopyWarning
filtered_df = new_df[new_df['Severity Label'] != 'normal'].copy()

# Collapse the remaining severity levels into two classes
severity_mapping = {
    'blocker': 'severe',
    'critical': 'severe',
    'major': 'severe',
    'trivial': 'non-severe',
    'minor': 'non-severe'
}
filtered_df['Severity Label'] = filtered_df['Severity Label'].map(severity_mapping)

# Map string labels to numeric representations for the classifier
label_mapping = {'non-severe': 0, 'severe': 1}
filtered_df['Severity Label'] = filtered_df['Severity Label'].map(label_mapping)

# Initialize the XLNet tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Training parameters
max_len = 100   # maximum sequence length
batch_size = 32
epochs = 5


# PyTorch dataset wrapping the tokenizer output and labels
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)


# Cross-project evaluation: each unique project is held out as the test set in turn
evaluation_results = {}

for test_project in filtered_df['Project'].unique():

    # Reinitialize the model for each test project (binary classification)
    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)

    # Hold out the current project for testing, train on the rest
    test_data = filtered_df[filtered_df['Project'] == test_project]
    train_data = filtered_df[filtered_df['Project'] != test_project]

    train_texts = train_data['Short Description'].astype(str).tolist()
    train_labels = train_data['Severity Label'].tolist()
    test_texts = test_data['Short Description'].astype(str).tolist()
    test_labels = test_data['Severity Label'].tolist()

    # Tokenize train and test data
    train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=max_len)
    test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=max_len)

    train_dataset = CustomDataset(train_encodings, train_labels)
    test_dataset = CustomDataset(test_encodings, test_labels)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',                   # output directory
        num_train_epochs=epochs,                  # total number of training epochs
        per_device_train_batch_size=batch_size,   # batch size per device during training
        per_device_eval_batch_size=batch_size,    # batch size for evaluation
        warmup_steps=500,                         # number of warmup steps for the learning rate scheduler
        weight_decay=0.01,                        # strength of weight decay
        logging_dir='./logs',                     # directory for storing logs
    )

    # Define trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    # Train the model
    trainer.train()

    # Save the fine-tuned weights to Google Drive
    model_save_path = f'/content/drive/My Drive/XLNet_model_project_{test_project}.pt'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to '{model_save_path}'")

    # Evaluate the model on the held-out project
    predictions = trainer.predict(test_dataset)
    # XLNet may return (logits, mems); keep only the logits before taking the argmax
    logits = predictions.predictions[0] if isinstance(predictions.predictions, tuple) else predictions.predictions
    preds = np.argmax(logits, axis=1)

    # Calculate evaluation metrics
    accuracy = accuracy_score(test_labels, preds)
    precision = precision_score(test_labels, preds)
    recall = recall_score(test_labels, preds)
    f1 = f1_score(test_labels, preds)
    mcc = matthews_corrcoef(test_labels, preds)
    conf_matrix = confusion_matrix(test_labels, preds)

    # Store evaluation results for the current test project
    evaluation_results[test_project] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'MCC': mcc,
        'Confusion Matrix': conf_matrix
    }

# Print evaluation results for all test projects
for project, results in evaluation_results.items():
    print(f"Evaluation results for Test Project '{project}':")
    for metric, value in results.items():
        if metric != 'Confusion Matrix':
            print(f"{metric}: {value}")
        else:
            print(f"{metric}:")
            print(value)
    print("------------------------------")

# Convert evaluation results to a DataFrame
df_results = pd.DataFrame.from_dict(evaluation_results, orient='index')

# Stringify the confusion matrices so the DataFrame can be written to Excel
df_results['Confusion Matrix'] = df_results['Confusion Matrix'].apply(np.array2string)

# Save results to an Excel file
excel_file_name = '/content/drive/My Drive/evaluation_results_XLNet.xlsx'
df_results.to_excel(excel_file_name)
print(f"Results saved to '{excel_file_name}'")
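For reference, a minimal sketch of how one of the per-project checkpoints saved by the deleted script could be reloaded for inference. The checkpoint path, the <PROJECT> placeholder, and the example sentence are illustrative assumptions; the model name, num_labels, and max_length are taken to match the training settings above.

import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification

# Placeholder path: substitute the project name used when the checkpoint was saved
checkpoint_path = '/content/drive/My Drive/XLNet_model_project_<PROJECT>.pt'

# Rebuild the same architecture used for training, then load the fine-tuned weights
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)
model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
model.eval()

# Classify a single bug-report summary (example text is illustrative only)
text = "Application crashes on startup when the config file is missing"
inputs = tokenizer(text, truncation=True, padding='max_length', max_length=100, return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs).logits

# Index 1 corresponds to 'severe' under the label mapping used above
label = 'severe' if logits.argmax(dim=-1).item() == 1 else 'non-severe'
print(label)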