AliArshad committed
Commit 5e2f008 · verified · 1 Parent(s): 794588f

Delete code.py

Files changed (1)
  1. code.py +0 -157
code.py DELETED
@@ -1,157 +0,0 @@
import numpy as np
import pandas as pd
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix

# Mount Google Drive (Colab only)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Path to the Excel file in Google Drive
file_path = '/content/drive/My Drive/filtered_data.xlsx'

# Read the Excel file into a pandas DataFrame
df = pd.read_excel(file_path)

# Keep only the columns needed for training and evaluation
selected_columns = ['Short Description', 'Severity Label', 'Project']
new_df = df[selected_columns].copy()

# Exclude bug reports with 'normal' severity; copy to avoid SettingWithCopyWarning
filtered_df = new_df[new_df['Severity Label'] != 'normal'].copy()

# Collapse the remaining severity levels into two classes
severity_mapping = {
    'blocker': 'severe',
    'critical': 'severe',
    'major': 'severe',
    'trivial': 'non-severe',
    'minor': 'non-severe'
}
filtered_df['Severity Label'] = filtered_df['Severity Label'].map(severity_mapping)

# Map string labels to numeric representations for the classifier
label_mapping = {'non-severe': 0, 'severe': 1}
filtered_df['Severity Label'] = filtered_df['Severity Label'].map(label_mapping)

# Initialize the XLNet tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Training parameters
max_len = 100   # maximum sequence length
batch_size = 32
epochs = 5


# PyTorch dataset wrapping the tokenizer output and labels
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)


# Cross-project evaluation: each unique project is held out as the test set in turn
evaluation_results = {}

for test_project in filtered_df['Project'].unique():

    # Reinitialize the model for each test project (binary classification)
    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)

    # Hold out the current project for testing, train on the rest
    test_data = filtered_df[filtered_df['Project'] == test_project]
    train_data = filtered_df[filtered_df['Project'] != test_project]

    train_texts = train_data['Short Description'].astype(str).tolist()
    train_labels = train_data['Severity Label'].tolist()
    test_texts = test_data['Short Description'].astype(str).tolist()
    test_labels = test_data['Severity Label'].tolist()

    # Tokenize train and test data
    train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=max_len)
    test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=max_len)

    train_dataset = CustomDataset(train_encodings, train_labels)
    test_dataset = CustomDataset(test_encodings, test_labels)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',                   # output directory
        num_train_epochs=epochs,                  # total number of training epochs
        per_device_train_batch_size=batch_size,   # batch size per device during training
        per_device_eval_batch_size=batch_size,    # batch size for evaluation
        warmup_steps=500,                         # number of warmup steps for the learning rate scheduler
        weight_decay=0.01,                        # strength of weight decay
        logging_dir='./logs',                     # directory for storing logs
    )

    # Define trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    # Train the model
    trainer.train()

    # Save the fine-tuned weights to Google Drive
    model_save_path = f'/content/drive/My Drive/XLNet_model_project_{test_project}.pt'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to '{model_save_path}'")

    # Evaluate the model on the held-out project
    predictions = trainer.predict(test_dataset)
    # XLNet may return (logits, mems); keep only the logits before taking the argmax
    logits = predictions.predictions[0] if isinstance(predictions.predictions, tuple) else predictions.predictions
    preds = np.argmax(logits, axis=1)

    # Calculate evaluation metrics
    accuracy = accuracy_score(test_labels, preds)
    precision = precision_score(test_labels, preds)
    recall = recall_score(test_labels, preds)
    f1 = f1_score(test_labels, preds)
    mcc = matthews_corrcoef(test_labels, preds)
    conf_matrix = confusion_matrix(test_labels, preds)

    # Store evaluation results for the current test project
    evaluation_results[test_project] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'MCC': mcc,
        'Confusion Matrix': conf_matrix
    }

# Print evaluation results for all test projects
for project, results in evaluation_results.items():
    print(f"Evaluation results for Test Project '{project}':")
    for metric, value in results.items():
        if metric != 'Confusion Matrix':
            print(f"{metric}: {value}")
        else:
            print(f"{metric}:")
            print(value)
    print("------------------------------")

# Convert evaluation results to a DataFrame
df_results = pd.DataFrame.from_dict(evaluation_results, orient='index')

# Stringify the confusion matrices so the DataFrame can be written to Excel
df_results['Confusion Matrix'] = df_results['Confusion Matrix'].apply(np.array2string)

# Save results to an Excel file
excel_file_name = '/content/drive/My Drive/evaluation_results_XLNet.xlsx'
df_results.to_excel(excel_file_name)
print(f"Results saved to '{excel_file_name}'")
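For reference, a minimal sketch of how one of the per-project checkpoints saved by the deleted script could be reloaded for inference. The checkpoint path, the <PROJECT> placeholder, and the example sentence are illustrative assumptions; the model name, num_labels, and max_length are taken to match the training settings above.

import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification

# Placeholder path: substitute the project name used when the checkpoint was saved
checkpoint_path = '/content/drive/My Drive/XLNet_model_project_<PROJECT>.pt'

# Rebuild the same architecture used for training, then load the fine-tuned weights
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)
model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
model.eval()

# Classify a single bug-report summary (example text is illustrative only)
text = "Application crashes on startup when the config file is missing"
inputs = tokenizer(text, truncation=True, padding='max_length', max_length=100, return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs).logits

# Index 1 corresponds to 'severe' under the label mapping used above
label = 'severe' if logits.argmax(dim=-1).item() == 1 else 'non-severe'
print(label)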