# Source: Hugging Face Space tahiryaqoob (app.py, commit 1ee4d14)
import os
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from transformers import pipeline
from sklearn.model_selection import train_test_split
def load_and_preprocess_data():
    """Download the BISELahore Q&A dataset and split it into train/validation.

    Returns:
        Tuple of (train_dataset, val_dataset) as `datasets.Dataset` objects,
        so callers can keep using `.map()` for tokenization.
    """
    dataset = load_dataset('tahiryaqoob/BISELahore')
    # Use the datasets-native splitter: sklearn's train_test_split applied to a
    # `datasets.Dataset` returns plain row collections that lack the `.map()`
    # API relied on by fine_tune_model. seed=42 keeps the split reproducible.
    split = dataset['train'].train_test_split(test_size=0.2, seed=42)
    train_dataset, val_dataset = split['train'], split['test']
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    return train_dataset, val_dataset
# Preprocess the data: tokenize question/answer pairs for seq2seq fine-tuning.
def preprocess_function(examples, tokenizer):
    """Tokenize a batch of examples into model inputs with labels.

    Args:
        examples: Batch dict with 'question' and 'answer' lists of strings.
        tokenizer: A Hugging Face tokenizer (callable, exposes pad_token_id).

    Returns:
        The tokenized inputs dict with a 'labels' key holding the tokenized
        answers; padding positions in labels are replaced by -100 so the
        loss function ignores them instead of learning to emit pad tokens.
    """
    inputs = tokenizer(examples['question'], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(examples['answer'], padding="max_length", truncation=True, max_length=128)
    pad_id = tokenizer.pad_token_id
    # -100 is the ignore_index used by the cross-entropy loss in transformers.
    inputs['labels'] = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in targets['input_ids']
    ]
    return inputs
def fine_tune_model(train_dataset, val_dataset):
    """Fine-tune a seq2seq model on the Q&A pairs and save it to disk.

    Args:
        train_dataset: `datasets.Dataset` with 'question'/'answer' columns.
        val_dataset: Validation split with the same columns.

    Side effects:
        Writes the trained model and tokenizer to ./distilbert_finetuned
        (directory name kept for compatibility with the serving code).
    """
    # NOTE(fix): the original used "distilbert-base-uncased", which is an
    # encoder-only model — AutoModelForSeq2SeqLM cannot load it and raises
    # at startup. t5-small is a true encoder-decoder suitable for
    # text2text-generation.
    model_name = "t5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    # Tokenize both splits in batches; Trainer drops the unused raw columns.
    train_dataset = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    val_dataset = val_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    training_args = TrainingArguments(
        output_dir="./distilbert_finetuned",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=200,
        evaluation_strategy="epoch",
        save_strategy="epoch",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )
    trainer.train()
    # Persist both artifacts so chatbot_inference() can reload them from disk.
    model.save_pretrained("./distilbert_finetuned")
    tokenizer.save_pretrained("./distilbert_finetuned")
    print("Model fine-tuned and saved successfully.")
# Create a chatbot inference pipeline using the fine-tuned model.
def chatbot_inference():
    """Load the fine-tuned model and tokenizer from disk and wrap them
    in a text2text-generation pipeline ready for queries."""
    model_dir = "./distilbert_finetuned"
    tok = AutoTokenizer.from_pretrained(model_dir)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    return pipeline("text2text-generation", model=seq2seq, tokenizer=tok)
# Run inference to test chatbot functionality.
def run_inference():
    """Prompt the user for one question and print the model's reply."""
    bot = chatbot_inference()
    question = input("Ask a question: ")
    reply = bot(question)
    print("Bot Response:", reply[0]['generated_text'])
# Main function to train or serve the chatbot.
def main():
    """Fine-tune the model unless a saved checkpoint exists, then serve it."""
    train_dataset, val_dataset = load_and_preprocess_data()
    if os.path.exists("./distilbert_finetuned"):
        print("Fine-tuned model already exists. Skipping fine-tuning.")
    else:
        fine_tune_model(train_dataset, val_dataset)
    run_inference()


if __name__ == "__main__":
    main()