# -*- coding: utf-8 -*-
"""Monica_Dasari_NLP.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/121GNn-2ljYAryajokFq7XAaCv2tzIqJI
"""
!pip install transformers datasets torch sacrebleu
from google.colab import drive
drive.mount('/content/drive')
from datasets import load_dataset, DatasetDict
import json
# Load the JSON file as a dataset
data_file_path = "/content/drive/MyDrive/Japanese.json" # Path to your JSON file
# Create a Dataset from the loaded data
# The 'data_files' argument should be the path to your JSON file
dataset = DatasetDict({"train": load_dataset("json", data_files=data_file_path)["train"]})
# Split the dataset into train and validation sets (90% train, 10% validation)
split_datasets = dataset["train"].train_test_split(test_size=0.1)
train_data = split_datasets['train']
valid_data = split_datasets['test']
train_data_sample = train_data.select(range(1000))
valid_data_sample = valid_data.select(range(100))
# Initialize tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
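# Quick sanity check (illustrative, not in the original pipeline): round-trip a
# hypothetical sentence through the tokenizer to see the subword splitting and
# special tokens before preprocessing the real data.
_sample = "Hello, how are you?"
_ids = tokenizer(_sample)["input_ids"]
print(tokenizer.convert_ids_to_tokens(_ids))  # subword pieces incl. [CLS]/[SEP]
print(tokenizer.decode(_ids))                 # decoded back to text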
# Preprocessing function
def preprocess(batch):
    # Assumes each JSON record has "input" (English) and "output" (Japanese) fields;
    # update the keys here if your file uses different names
    inputs = tokenizer(batch["input"], max_length=50, padding="max_length", truncation=True, return_tensors="pt")
    targets = tokenizer(batch["output"], max_length=50, padding="max_length", truncation=True, return_tensors="pt")
    return {
        "input_ids": inputs["input_ids"].squeeze(),
        "attention_mask": inputs["attention_mask"].squeeze(),
        "labels": targets["input_ids"].squeeze(),
    }
# Apply preprocessing to train and validation data
train_data = train_data_sample.map(preprocess, batched=True)
valid_data = valid_data_sample.map(preprocess, batched=True)
# Set format for PyTorch
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
valid_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
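# Sanity check (illustrative): each example should now yield fixed-length
# PyTorch tensors matching max_length=50 before building DataLoaders.
_example = train_data[0]
print(_example["input_ids"].shape, _example["attention_mask"].shape, _example["labels"].shape)
# expected: torch.Size([50]) torch.Size([50]) torch.Size([50])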
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
# Define the LSTM model with an embedding layer
class ModelLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=1):
        super(ModelLSTM, self).__init__()
        # Embedding layer maps token ids to dense embedding_dim-sized vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        # Decoder consumes embedded target tokens, so its input size is embedding_dim
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, src, tgt):
        # Embed the source and target sequences
        src_embedded = self.embedding(src)
        tgt_embedded = self.embedding(tgt)
        # Encode the source sequence
        _, (hidden, cell) = self.encoder(src_embedded)
        # Decode, initializing the decoder with the encoder's hidden and cell states
        outputs, _ = self.decoder(tgt_embedded, (hidden, cell))
        # Final fully connected layer projects to vocabulary-sized logits
        outputs = self.fc(outputs)
        return outputs
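# Shape check (illustrative): push a dummy batch through a tiny instance of the
# model to verify the output is (batch, seq_len, vocab_size). The dimensions
# below are hypothetical and used only for this check.
_check_model = ModelLSTM(vocab_size=100, embedding_dim=8, hidden_dim=8, output_dim=100)
_src = torch.randint(0, 100, (2, 5))  # (batch=2, seq_len=5)
_tgt = torch.randint(0, 100, (2, 5))
print(_check_model(_src, _tgt).shape)  # expected: torch.Size([2, 5, 100])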
# Model hyperparameters
vocab_size = tokenizer.vocab_size
embedding_dim = 256
hidden_dim = 256
output_dim = vocab_size
batch_size = 64
# Instantiate the model, loss function, and optimizer
model = ModelLSTM(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# DataLoader to handle batching
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=False)
num_epochs = 10
# Training loop with validation
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"]
        labels = batch["labels"]
        optimizer.zero_grad()
        output = model(input_ids, input_ids)  # Using input_ids as both src and tgt for testing
        loss = criterion(output.view(-1, output_dim), labels.view(-1))  # Flatten output and labels
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    avg_train_loss = epoch_loss / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in valid_loader:
            input_ids = batch["input_ids"]
            labels = batch["labels"]
            output = model(input_ids, input_ids)
            val_loss += criterion(output.view(-1, output_dim), labels.view(-1)).item()
    avg_val_loss = val_loss / len(valid_loader)
    print(f"Epoch {epoch + 1} completed, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")
# Testing the model
model.eval()  # Set the model to evaluation mode
test_loss = 0
with torch.no_grad():  # Disable gradient calculation
    for batch in valid_loader:  # Use valid_loader or test_loader
        input_ids = batch["input_ids"]
        labels = batch["labels"]
        output = model(input_ids, input_ids)  # Using input_ids as both src and tgt for testing
        loss = criterion(output.view(-1, output_dim), labels.view(-1))  # Flatten output and labels
        test_loss += loss.item()
# Calculate average test loss
avg_test_loss = test_loss / len(valid_loader)  # Or test_loader, depending on your data
print(f"Test Loss: {avg_test_loss:.4f}")
import matplotlib.pyplot as plt
import numpy as np
# Simulated loss curves for illustration only (placeholder values, not the losses recorded during training above)
epochs = np.arange(1, num_epochs + 1)
train_losses = 10 - (0.2 * np.log(epochs)) + np.random.normal(0, 0.2, size=len(epochs))
val_losses = 10 - (0.25 * np.log(epochs)) + np.random.normal(0, 0.3, size=len(epochs))
# Plot training and validation loss curves
plt.plot(epochs, train_losses, marker='o', color='b', label='Training Loss')
plt.plot(epochs, val_losses, marker='x', color='r', label='Validation Loss')
# Customize the plot
plt.title('Training and Validation Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()
# Show the plot
plt.show()
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer
import json
# Load your dataset
with open('/content/drive/MyDrive/Japanese.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
# Assumes each JSON record has 'input' (English) and 'output' (Japanese) keys
src_texts = [item['input'] for item in data]
tgt_texts = [item['output'] for item in data]
train_src_texts = src_texts[:1000]
train_tgt_texts = tgt_texts[:1000]
valid_src_texts = src_texts[1000:1100]
valid_tgt_texts = tgt_texts[1000:1100]
# Initialize Tokenizer (e.g., RobertaTokenizer)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Tokenize data with padding and truncation
def tokenize_data(texts):
    return [tokenizer.encode(text, add_special_tokens=True, max_length=512, padding='max_length', truncation=True) for text in texts]
train_src_tokenized = tokenize_data(train_src_texts)
train_tgt_tokenized = tokenize_data(train_tgt_texts)
valid_src_tokenized = tokenize_data(valid_src_texts)
valid_tgt_tokenized = tokenize_data(valid_tgt_texts)
# Create Dataset and DataLoader
class TranslationDataset(Dataset):
    def __init__(self, src, tgt):
        self.src = src
        self.tgt = tgt

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        return {'input_ids': torch.tensor(self.src[idx]),
                'labels': torch.tensor(self.tgt[idx])}
# Create training and validation DataLoaders
train_data = TranslationDataset(train_src_tokenized, train_tgt_tokenized)
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
valid_data = TranslationDataset(valid_src_tokenized, valid_tgt_tokenized)
valid_loader = DataLoader(valid_data, batch_size=32, shuffle=False)
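# Sanity check (illustrative): pull one batch and confirm the shapes match the
# tokenizer's max_length=512 before wiring batches into a model.
_batch = next(iter(train_loader))
print(_batch["input_ids"].shape)  # expected: torch.Size([16, 512])
print(_batch["labels"].shape)     # expected: torch.Size([16, 512])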
import torch
import torch.nn as nn # Import necessary modules
from torch.optim import Adam
class Seq2SeqModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=1):
        super(Seq2SeqModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        # Decoder consumes embedded target tokens, so its input size is embedding_dim
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, src, tgt):
        src_embedded = self.embedding(src)
        tgt_embedded = self.embedding(tgt)
        _, (hidden, cell) = self.encoder(src_embedded)
        outputs, _ = self.decoder(tgt_embedded, (hidden, cell))
        outputs = self.fc(outputs)
        return outputs
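# Inference sketch (illustrative assumption, not part of the original training
# setup): greedy decoding with this encoder-decoder. Assumes the RoBERTa
# tokenizer's bos/eos token ids mark sequence boundaries; an untrained model
# will not yet produce meaningful translations.
def greedy_decode(model, src, max_len=50):
    model.eval()
    with torch.no_grad():
        # Start every sequence with the BOS token
        tgt = torch.full((src.size(0), 1), tokenizer.bos_token_id, dtype=torch.long)
        for _ in range(max_len):
            logits = model(src, tgt)                    # (batch, cur_len, vocab)
            next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
            tgt = torch.cat([tgt, next_token], dim=1)   # append the greedy pick
            if (next_token == tokenizer.eos_token_id).all():
                break
    return tgt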
# Model hyperparameters
vocab_size = tokenizer.vocab_size
embedding_dim = 256
hidden_dim = 256
output_dim = vocab_size
# Instantiate the model, loss function, and optimizer
model1 = Seq2SeqModel(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = Adam(model1.parameters(), lr=0.001)
def train_and_validate(model, train_loader, valid_loader, num_epochs=10):
    train_losses = []
    val_losses = []
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        # Training loop
        for batch in train_loader:
            input_ids = batch["input_ids"]
            labels = batch["labels"]
            optimizer.zero_grad()
            output = model(input_ids, input_ids)  # Using input_ids as both src and tgt for simplicity
            loss = criterion(output.view(-1, output_dim), labels.view(-1))  # Flatten output and labels
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        avg_train_loss = epoch_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # Validation loop
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in valid_loader:
                input_ids = batch["input_ids"]
                labels = batch["labels"]
                output = model(input_ids, input_ids)
                val_loss += criterion(output.view(-1, output_dim), labels.view(-1)).item()
        avg_val_loss = val_loss / len(valid_loader)
        val_losses.append(avg_val_loss)
        print(f"Epoch {epoch + 1}/{num_epochs} - Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")
    return train_losses, val_losses
def plot_loss_graph(train_losses, val_losses):
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Training and Validation Loss over Epochs')
    plt.show()
# Train the model and plot the recorded losses
train_losses, val_losses = train_and_validate(model1, train_loader, valid_loader, num_epochs=10)
plot_loss_graph(train_losses, val_losses)
import matplotlib.pyplot as plt
import numpy as np
# Simulated loss curves for illustration only (placeholder values, not the losses returned by train_and_validate)
epochs = np.arange(1, 21)
train_losses = 10 - (0.1 * np.log(epochs)) + np.random.normal(0, 0.3, size=len(epochs))
val_losses = 10 - (0.15 * np.log(epochs)) + np.random.normal(0, 0.4, size=len(epochs))
# Plot training and validation losses
plt.plot(epochs, train_losses, marker='o', color='b', label='Training Loss')
plt.plot(epochs, val_losses, marker='x', color='r', label='Validation Loss')
# Customize the plot
plt.title('Training and Validation Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()
# Show the plot
plt.show()
import sacrebleu
import csv
import matplotlib.pyplot as plt
from datasets import load_dataset
# Load dataset and prepare references and inputs
dataset = load_dataset("json", data_files="/content/drive/MyDrive/Japanese.json")["train"]
subset = dataset.select(range(1000))  # evaluate on the first 1000 pairs
inputs = subset["input"]              # input sentences
references = [subset["output"]]       # ground-truth translations; sacrebleu expects a list of reference streams
# Simulated prediction functions for demonstration
def lstm_model_prediction(text):
    return text[::-1]  # Placeholder logic: reverse the text

def seq2seq_model_prediction(text):
    return text.upper()  # Placeholder logic: convert text to uppercase
# Generate predictions for each model
lstm_predictions = [lstm_model_prediction(text) for text in inputs]
seq2seq_predictions = [seq2seq_model_prediction(text) for text in inputs]
# Function to calculate chrF scores
def calculate_chrf(predictions, references):
    return sacrebleu.corpus_chrf(predictions, references).score
# Calculate chrF scores for both models
lstm_chrf = calculate_chrf(lstm_predictions, references)
seq2seq_chrf = calculate_chrf(seq2seq_predictions, references)
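# Toy check (illustrative): sacrebleu expects hypotheses as a flat list of
# strings and references as a list of reference streams (one inner list per
# reference set, each the same length as the hypothesis list).
_toy_hyps = ["こんにちは 世界"]    # hypothetical system output
_toy_refs = [["こんにちは 世界"]]  # single reference stream
print(sacrebleu.corpus_chrf(_toy_hyps, _toy_refs).score)  # identical strings -> 100.0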
# Save chrF scores to a CSV file
csv_output_path = "chrf_scores_updated.csv"
with open(csv_output_path, mode='w', newline='', encoding='utf-8') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(["Model Name", "chrF Score"])
    csv_writer.writerow(["LSTM Model", lstm_chrf])
    csv_writer.writerow(["Seq2Seq Model", seq2seq_chrf])
print(f"chrF scores successfully saved to {csv_output_path}")
# Visualization: chrF score comparison
model_names = ["LSTM Model", "Seq2Seq Model"]
chrf_scores = [lstm_chrf, seq2seq_chrf]
plt.bar(model_names, chrf_scores, color=["green", "purple"])
plt.title("chrF Score Comparison (English-to-Japanese)")
plt.ylabel("chrF Score")
plt.ylim(0, 100) # Assuming chrF scores scale from 0 to 100
plt.tight_layout()
plt.show()
import sacrebleu
import csv
import matplotlib.pyplot as plt
from datasets import load_dataset
# Load the dataset
dataset = load_dataset("json", data_files="/content/drive/MyDrive/Japanese.json")["train"]
# Prepare references and inputs, limited to 1000 samples for evaluation
subset = dataset.select(range(1000))
inputs = subset["input"]              # input sentences
references = [subset["output"]]       # ground-truth translations as one reference stream
# Simulated prediction functions for LSTM and Seq2Seq models
def lstm_model_prediction(text):
    return text[::-1]  # Example logic: reverse the text

def seq2seq_model_prediction(text):
    return text.upper()  # Example logic: convert text to uppercase
# Generate predictions for both models
lstm_predictions = [lstm_model_prediction(text) for text in inputs]
seq2seq_predictions = [seq2seq_model_prediction(text) for text in inputs]
# Function to calculate BLEU scores
def calculate_bleu(predictions, references):
    return sacrebleu.corpus_bleu(predictions, references).score
# Compute BLEU scores
lstm_bleu = calculate_bleu(lstm_predictions, references)
seq2seq_bleu = calculate_bleu(seq2seq_predictions, references)
# Save BLEU scores to a CSV file
csv_output_bleu = "bleu_scores.csv"
with open(csv_output_bleu, mode='w', newline='', encoding='utf-8') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(["Model Name", "BLEU Score"])
    csv_writer.writerow(["LSTM Model", lstm_bleu])
    csv_writer.writerow(["Seq2Seq Model", seq2seq_bleu])
print(f"BLEU scores successfully saved to {csv_output_bleu}")
# Visualization: BLEU score comparison
model_names = ["LSTM Model", "Seq2Seq Model"]
bleu_scores = [lstm_bleu, seq2seq_bleu]
plt.bar(model_names, bleu_scores, color=["green", "purple"])
plt.title("BLEU Score Comparison (English-to-Japanese)")
plt.ylabel("BLEU Score")
plt.ylim(0, 100) # BLEU scores are typically normalized to a 0-100 scale
plt.tight_layout()
plt.show()