import os

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from transformers import pipeline

# Load the dataset and split it into training and validation sets
def load_and_preprocess_data():
    dataset = load_dataset('tahiryaqoob/BISELahore')
    # Dataset.train_test_split keeps both halves as datasets.Dataset objects,
    # so they can still be tokenized later with .map()
    split = dataset['train'].train_test_split(test_size=0.2, seed=42)
    train_dataset, val_dataset = split['train'], split['test']
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    return train_dataset, val_dataset

# Preprocess the data into the format expected for fine-tuning
def preprocess_function(examples, tokenizer):
    inputs = tokenizer(examples['question'], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(examples['answer'], padding="max_length", truncation=True, max_length=128)
    inputs['labels'] = targets['input_ids']
    return inputs
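
# Optional refinement (a sketch, not part of the original script): seq2seq
# training usually replaces padding token ids in the labels with -100 so that
# padded positions are ignored by the cross-entropy loss, e.g.:
#
#   inputs['labels'] = [
#       [(tok if tok != tokenizer.pad_token_id else -100) for tok in label]
#       for label in targets['input_ids']
#   ]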

# Fine-tune a sequence-to-sequence model on the question/answer pairs
def fine_tune_model(train_dataset, val_dataset):
    # Note: DistilBERT is an encoder-only model and cannot be loaded with
    # AutoModelForSeq2SeqLM, so a seq2seq checkpoint (t5-small) is used here
    # instead of distilbert-base-uncased.
    model_name = "t5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    train_dataset = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    val_dataset = val_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    training_args = TrainingArguments(
        output_dir="./distilbert_finetuned",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=200,
        evaluation_strategy="epoch",
        save_strategy="epoch",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )
    trainer.train()
    model.save_pretrained("./distilbert_finetuned")
    tokenizer.save_pretrained("./distilbert_finetuned")
    print("Model fine-tuned and saved successfully.")

# Create a chatbot inference pipeline using the fine-tuned model
def chatbot_inference():
    model_name = "./distilbert_finetuned"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    chatbot = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
    return chatbot
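
# Note (an assumption, not in the original script): generation options such as
# max_length or num_beams can be passed when calling the pipeline, e.g.
# chatbot(question, max_length=128).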

# Run inference to test chatbot functionality
def run_inference():
    chatbot = chatbot_inference()
    user_input = input("Ask a question: ")
    response = chatbot(user_input)
    print("Bot Response:", response[0]['generated_text'])

# Main function to train or serve the chatbot
def main():
    train_dataset, val_dataset = load_and_preprocess_data()
    if not os.path.exists("./distilbert_finetuned"):
        fine_tune_model(train_dataset, val_dataset)
    else:
        print("Fine-tuned model already exists. Skipping fine-tuning.")
    run_inference()


if __name__ == "__main__":
    main()