File size: 3,037 Bytes
166b8b0
 
72d605a
166b8b0
 
75e0590
f2a3661
166b8b0
75e0590
 
166b8b0
 
 
 
 
1ee4d14
166b8b0
 
 
 
 
1ee4d14
166b8b0
75e0590
166b8b0
 
75e0590
166b8b0
 
 
 
75e0590
 
 
 
 
 
 
166b8b0
75e0590
 
166b8b0
75e0590
166b8b0
75e0590
 
166b8b0
 
 
 
75e0590
166b8b0
75e0590
 
 
166b8b0
cccf288
1ee4d14
166b8b0
75e0590
166b8b0
 
 
75e0590
166b8b0
 
1ee4d14
166b8b0
 
75e0590
166b8b0
 
 
1ee4d14
166b8b0
 
4e6dde8
75e0590
166b8b0
 
 
75e0590
166b8b0
4e6dde8
 
 
adccf9e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from transformers import pipeline
from sklearn.model_selection import train_test_split

def load_and_preprocess_data():
    """Load the BISELahore Q&A dataset and split it 80/20 into train/validation.

    Returns:
        tuple[Dataset, Dataset]: (train_dataset, val_dataset) as
        ``datasets.Dataset`` objects, so downstream ``.map`` calls work.
    """
    dataset = load_dataset('tahiryaqoob/BISELahore')
    # Bug fix: sklearn's train_test_split on a datasets.Dataset returns plain
    # lists of example dicts, which later breaks `.map(...)` in fine_tune_model.
    # Use the datasets-native splitter, which returns Dataset objects.
    split = dataset['train'].train_test_split(test_size=0.2, seed=42)
    train_dataset, val_dataset = split['train'], split['test']

    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    return train_dataset, val_dataset

#Preprocess the data to format for fine-tuning
def preprocess_function(examples, tokenizer):
    """Tokenize question/answer pairs for seq2seq fine-tuning.

    Args:
        examples: batch dict with 'question' and 'answer' lists (batched map).
        tokenizer: a Hugging Face tokenizer exposing ``pad_token_id``.

    Returns:
        dict: tokenized inputs with a 'labels' key suitable for a
        sequence-to-sequence loss.
    """
    inputs = tokenizer(examples['question'], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(examples['answer'], padding="max_length", truncation=True, max_length=128)
    # Bug fix: replace pad-token ids in the labels with -100 so the loss
    # ignores padding positions (HF convention); otherwise the model is
    # penalized for not predicting pad tokens.
    pad_id = tokenizer.pad_token_id
    inputs['labels'] = [
        [(tok if tok != pad_id else -100) for tok in seq]
        for seq in targets['input_ids']
    ]
    return inputs
    
def fine_tune_model(train_dataset, val_dataset):
    """Fine-tune a seq2seq model on Q&A pairs and save it to ./distilbert_finetuned.

    Args:
        train_dataset: ``datasets.Dataset`` with 'question' and 'answer' columns.
        val_dataset: validation split with the same schema.

    Side effects:
        Writes model + tokenizer to ./distilbert_finetuned and logs to ./logs.
    """
    # Bug fix: "distilbert-base-uncased" is an encoder-only checkpoint and
    # AutoModelForSeq2SeqLM.from_pretrained raises for it. Use an actual
    # encoder-decoder model for text2text generation.
    model_name = "t5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    train_dataset = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    val_dataset = val_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)

    training_args = TrainingArguments(
        output_dir="./distilbert_finetuned",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=200,
        # NOTE(review): renamed to `eval_strategy` in transformers >= 4.46 —
        # confirm against the pinned transformers version.
        evaluation_strategy="epoch",
        save_strategy="epoch",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()

    model.save_pretrained("./distilbert_finetuned")
    tokenizer.save_pretrained("./distilbert_finetuned")
    print("Model fine-tuned and saved successfully.")

#Create a chatbot inference pipeline using the fine-tuned model
def chatbot_inference():
    """Load the saved fine-tuned model and wrap it in a generation pipeline.

    Returns:
        A transformers text2text-generation pipeline backed by the
        checkpoint stored at ./distilbert_finetuned.
    """
    model_dir = "./distilbert_finetuned"
    finetuned_tokenizer = AutoTokenizer.from_pretrained(model_dir)
    finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    return pipeline("text2text-generation", model=finetuned_model, tokenizer=finetuned_tokenizer)

#Run inference to test chatbot functionality
def run_inference():
    """Prompt the user for one question and print the model's generated answer."""
    bot = chatbot_inference()
    question = input("Ask a question: ")
    answer = bot(question)[0]['generated_text']
    print("Bot Response:", answer)

#Main function to train or serve the chatbot
def main():
    """Fine-tune if no checkpoint directory exists, then serve one-shot inference."""
    train_dataset, val_dataset = load_and_preprocess_data()

    # Skip training when a previous run already produced a checkpoint.
    if os.path.exists("./distilbert_finetuned"):
        print("Fine-tuned model already exists. Skipping fine-tuning.")
    else:
        fine_tune_model(train_dataset, val_dataset)

    run_inference()

# Script entry point: train (if needed) and start the interactive chatbot.
if __name__ == "__main__":
    main()