NavyaNayer committed
Commit 3a42b37 · verified · 1 Parent(s): 613ca7d

Delete complexity_Score_finetuned.py

Files changed (1)
  1. complexity_Score_finetuned.py +0 -273
complexity_Score_finetuned.py DELETED
@@ -1,273 +0,0 @@
- import torch
- import random
- import numpy as np
- from tqdm import tqdm
- from datasets import load_dataset
- from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
- from torch.utils.data import DataLoader
- from torch.optim import AdamW  # use torch's AdamW; the transformers re-export is deprecated
- from sklearn.metrics import r2_score, f1_score
-
- # Set random seeds for reproducibility
- torch.manual_seed(42)
- np.random.seed(42)
- random.seed(42)
-
- # Load the DEITA-Complexity dataset
- dataset = load_dataset("hkust-nlp/deita-complexity-scorer-data")
- val_data = dataset["validation"]
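- # Each example is assumed to expose an "input" text field and a "target"
- # complexity score (stored as a string, hence the float() cast below).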
-
- # Initialize tokenizer
- tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
-
- # Preprocessing function
- def preprocess_function(examples):
-     return tokenizer(examples["input"], truncation=True, padding="max_length", max_length=128)
-
- # Tokenize the validation split
- val_encodings = val_data.map(preprocess_function, batched=True)
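- # map(..., batched=True) adds the tokenizer outputs (input_ids, attention_mask)
- # as new columns alongside the original columns such as "target".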
-
- # Inspect the structure of val_encodings
- print("Validation Encodings Structure:")
- print(val_encodings)
-
- # Convert the dataset to PyTorch format
- class ComplexityDataset(torch.utils.data.Dataset):
-     def __init__(self, encodings):
-         self.encodings = encodings
-
-     def __len__(self):
-         return len(self.encodings['input_ids'])
-
-     def __getitem__(self, idx):
-         # Create a dictionary for the inputs
-         item = {
-             "input_ids": torch.tensor(self.encodings['input_ids'][idx]),
-             "attention_mask": torch.tensor(self.encodings['attention_mask'][idx]),
-             # Convert target to float (the dataset stores it as a string)
-             "labels": torch.tensor(float(self.encodings['target'][idx]), dtype=torch.float),
-         }
-         return item
-
- val_dataset = ComplexityDataset(val_encodings)
-
- # Load pre-trained DistilBERT with a single-output regression head
- model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)
-
- # Freeze the first 4 transformer layers
- for layer in model.distilbert.transformer.layer[:4]:
-     for param in layer.parameters():
-         param.requires_grad = False
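- # Freezing the lower layers keeps their general-purpose features fixed and
- # shrinks the trainable parameter count if a training loop is added later.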
-
- # Define optimizer (unused here: this script only runs evaluation)
- optimizer = AdamW(model.parameters(), lr=2e-5)
-
- # Use GPU if available
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- model.to(device)
-
- # DataLoader for batching; shuffle=False keeps predictions in dataset order
- val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
-
- # Evaluation function
- def evaluate_model(model, val_loader):
-     model.eval()
-     val_loss = 0.0
-     total_mae = 0.0
-     all_predictions = []
-     all_labels = []
-
-     with torch.no_grad():
-         for batch in tqdm(val_loader, desc="Evaluating", leave=False):
-             batch = {key: val.to(device) for key, val in batch.items()}
-             outputs = model(**batch)
-             # squeeze(-1) drops only the label dimension, so a final batch of
-             # size 1 is not collapsed to a 0-d tensor
-             preds = outputs.logits.squeeze(-1)
-             loss = torch.nn.functional.mse_loss(preds, batch["labels"])
-
-             val_loss += loss.item()
-             total_mae += torch.nn.functional.l1_loss(preds, batch["labels"], reduction="sum").item()
-
-             all_predictions.extend(preds.cpu().numpy())
-             all_labels.extend(batch["labels"].cpu().numpy())
-
-     avg_val_loss = val_loss / len(val_loader)
-     avg_val_mae = total_mae / len(val_loader.dataset)
-
-     # Calculate additional metrics
-     r2 = r2_score(all_labels, all_predictions)
-     f1 = f1_score(np.round(all_labels), np.round(all_predictions), average='weighted')
-
-     return avg_val_loss, avg_val_mae, r2, f1, all_predictions, all_labels
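- # Note on metrics: R² is computed on the raw continuous predictions, while
- # F1 rounds labels and predictions to the nearest integer, treating each
- # integer complexity level as a discrete class.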
-
- # Evaluate the model
- val_loss, val_mae, r2, f1, predictions, labels = evaluate_model(model, val_loader)
-
- print(f"Validation Loss = {val_loss:.4f}, Validation MAE = {val_mae:.4f}, R² Score = {r2:.4f}, F1 Score = {f1:.4f}")
-
- # Test the model (inference on the validation set)
- def test_model(model, val_loader):
-     model.eval()
-     all_predictions = []
-     all_labels = []
-
-     with torch.no_grad():
-         for batch in tqdm(val_loader, desc="Testing", leave=False):
-             batch = {key: val.to(device) for key, val in batch.items()}
-             outputs = model(**batch)
-
-             all_predictions.extend(outputs.logits.squeeze(-1).cpu().numpy())
-             all_labels.extend(batch["labels"].cpu().numpy())
-
-     return np.array(all_predictions), np.array(all_labels)
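- # No separate test split is used; the validation loader serves as a stand-in
- # test set, so these numbers mirror the evaluation above.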
-
- # Get predictions and labels from the test function
- test_predictions, test_labels = test_model(model, val_loader)
-
- # Compute the same metrics on the test predictions
- test_r2 = r2_score(test_labels, test_predictions)
- test_f1 = f1_score(np.round(test_labels), np.round(test_predictions), average='weighted')
-
- print(f"Test R² Score = {test_r2:.4f}, Test F1 Score = {test_f1:.4f}")
-
- # Save the model and tokenizer (the weights are unchanged here, since no
- # training loop runs in this script)
- model.save_pretrained("fine_tuned_deita_model")
- tokenizer.save_pretrained("fine_tuned_deita_model")
-
- print("✅ Evaluation and testing complete! Model saved at 'fine_tuned_deita_model'.")