# Fine-tune a seq2seq model on the BISELahore Q&A dataset and serve a simple chatbot.
import os

import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
)
def load_and_preprocess_data():
    """Load the BISELahore Q&A dataset and split it into train/validation sets.

    Returns:
        tuple: ``(train_dataset, val_dataset)`` as HF ``Dataset`` objects, so
        downstream code can call ``.map()`` on them.
    """
    dataset = load_dataset('tahiryaqoob/BISELahore')
    # BUG FIX: sklearn's train_test_split on a HF Dataset returns plain lists
    # of rows, which have no `.map()` and break fine_tune_model. Use the
    # datasets-native split so both return values stay Dataset objects.
    split = dataset['train'].train_test_split(test_size=0.2, seed=42)
    train_dataset, val_dataset = split['train'], split['test']
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    return train_dataset, val_dataset
# Preprocess the data into the format expected for fine-tuning.
def preprocess_function(examples, tokenizer):
    """Tokenize question/answer pairs; answers become the ``labels`` field.

    Both sides are padded/truncated to 128 tokens so batches are rectangular.
    """
    model_inputs = tokenizer(
        examples['question'], padding="max_length", truncation=True, max_length=128
    )
    answer_encodings = tokenizer(
        examples['answer'], padding="max_length", truncation=True, max_length=128
    )
    model_inputs['labels'] = answer_encodings['input_ids']
    return model_inputs
def fine_tune_model(train_dataset, val_dataset):
    """Fine-tune a seq2seq model on tokenized Q&A data and save it to disk.

    Args:
        train_dataset: HF ``Dataset`` with ``question``/``answer`` columns.
        val_dataset: HF ``Dataset`` used for per-epoch evaluation.

    Side effects:
        Writes the trained model and tokenizer to ``./distilbert_finetuned``
        (path kept unchanged so chatbot_inference() still finds it).
    """
    # BUG FIX: "distilbert-base-uncased" is an encoder-only model;
    # AutoModelForSeq2SeqLM.from_pretrained() raises for it. Use an actual
    # encoder-decoder checkpoint that works with text2text-generation.
    model_name = "t5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    # batched=True passes lists of strings to the tokenizer in one call.
    train_dataset = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    val_dataset = val_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    training_args = TrainingArguments(
        output_dir="./distilbert_finetuned",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=200,
        # NOTE(review): renamed to `eval_strategy` in recent transformers
        # releases — adjust if the installed version rejects this kwarg.
        evaluation_strategy="epoch",
        save_strategy="epoch",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )
    trainer.train()
    model.save_pretrained("./distilbert_finetuned")
    tokenizer.save_pretrained("./distilbert_finetuned")
    print("Model fine-tuned and saved successfully.")
# Build a chatbot inference pipeline from the fine-tuned model on disk.
def chatbot_inference():
    """Return a text2text-generation pipeline backed by the saved model."""
    saved_dir = "./distilbert_finetuned"
    return pipeline(
        "text2text-generation",
        model=AutoModelForSeq2SeqLM.from_pretrained(saved_dir),
        tokenizer=AutoTokenizer.from_pretrained(saved_dir),
    )
# Interactive smoke test: one question in, one generated answer out.
def run_inference():
    """Prompt the user for a question and print the chatbot's reply."""
    bot = chatbot_inference()
    question = input("Ask a question: ")
    answer = bot(question)
    print("Bot Response:", answer[0]['generated_text'])
# Entry point: train once if no checkpoint exists, then serve the chatbot.
def main():
    """Fine-tune the model unless a saved checkpoint exists, then run inference."""
    train_dataset, val_dataset = load_and_preprocess_data()
    if os.path.exists("./distilbert_finetuned"):
        print("Fine-tuned model already exists. Skipping fine-tuning.")
    else:
        fine_tune_model(train_dataset, val_dataset)
    run_inference()


if __name__ == "__main__":
    main()