import os

from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
)

# Checkpoint to fine-tune.  NOTE(review): the original used
# "distilbert-base-uncased", but DistilBERT is an encoder-only model and
# AutoModelForSeq2SeqLM cannot load it -- text2text generation needs a real
# encoder-decoder checkpoint such as T5.
BASE_MODEL = "t5-small"
# Kept at the original path so an already-saved model is still found.
MODEL_DIR = "./distilbert_finetuned"
MAX_LENGTH = 128


def load_and_preprocess_data():
    """Load the BISELahore Q&A dataset and split it 80/20 into train/validation.

    Returns:
        (train_dataset, val_dataset): two ``datasets.Dataset`` objects.
    """
    dataset = load_dataset('tahiryaqoob/BISELahore')
    # Use datasets' own split helper: sklearn's train_test_split does not
    # understand Dataset objects, and its output would break `.map` below.
    split = dataset['train'].train_test_split(test_size=0.2, seed=42)
    train_dataset, val_dataset = split['train'], split['test']
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    return train_dataset, val_dataset


# Preprocess the data into the format needed for fine-tuning.
def preprocess_function(examples, tokenizer):
    """Tokenize a batch of question/answer pairs for seq2seq fine-tuning.

    Args:
        examples: batch dict with 'question' and 'answer' lists of strings.
        tokenizer: a Hugging Face tokenizer (callable, has ``pad_token_id``).

    Returns:
        Tokenized questions with a 'labels' key holding the answer token ids;
        padding positions in the labels are replaced with -100 so the
        cross-entropy loss ignores them.
    """
    inputs = tokenizer(examples['question'], padding="max_length",
                       truncation=True, max_length=MAX_LENGTH)
    targets = tokenizer(examples['answer'], padding="max_length",
                        truncation=True, max_length=MAX_LENGTH)
    pad_id = getattr(tokenizer, "pad_token_id", None)
    # -100 is the ignore_index of the loss used by HF seq2seq models.
    inputs['labels'] = [
        [(tok if tok != pad_id else -100) for tok in seq]
        for seq in targets['input_ids']
    ]
    return inputs


def fine_tune_model(train_dataset, val_dataset):
    """Fine-tune BASE_MODEL on the Q&A data and save model+tokenizer to MODEL_DIR."""
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)

    train_dataset = train_dataset.map(
        lambda x: preprocess_function(x, tokenizer), batched=True)
    val_dataset = val_dataset.map(
        lambda x: preprocess_function(x, tokenizer), batched=True)

    training_args = TrainingArguments(
        output_dir=MODEL_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=200,
        # NOTE(review): this kwarg was renamed to `eval_strategy` in
        # transformers >= 4.46 -- adjust if your version rejects it.
        evaluation_strategy="epoch",
        save_strategy="epoch",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()
    model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)
    print("Model fine-tuned and saved successfully.")


# Create a chatbot inference pipeline using the fine-tuned model.
def chatbot_inference():
    """Load the fine-tuned model and return a text2text-generation pipeline."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)
    return pipeline("text2text-generation", model=model, tokenizer=tokenizer)


# Run inference to test chatbot functionality.
def run_inference():
    """Prompt the user for one question and print the model's generated answer."""
    chatbot = chatbot_inference()
    user_input = input("Ask a question: ")
    response = chatbot(user_input)
    print("Bot Response:", response[0]['generated_text'])


# Main function to train or serve the chatbot.
def main():
    """Fine-tune if no saved model exists, then answer a single question."""
    if not os.path.exists(MODEL_DIR):
        # Only download and split the dataset when training actually happens.
        train_dataset, val_dataset = load_and_preprocess_data()
        fine_tune_model(train_dataset, val_dataset)
    else:
        print("Fine-tuned model already exists. Skipping fine-tuning.")
    run_inference()


if __name__ == "__main__":
    main()