Rajut committed on
Commit bc9b495 · verified · 1 Parent(s): 5eaca3e

Create app.py

Files changed (1)
  1. app.py +97 -0
app.py ADDED
@@ -0,0 +1,97 @@
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
+from transformers import TextDataset, DataCollatorForLanguageModeling
+from transformers import Trainer, TrainingArguments
+import torch
+import os
+import gradio as gr
+
+# Load pre-trained GPT-2 model and tokenizer
+model_name = "gpt2"
+model = GPT2LMHeadModel.from_pretrained(model_name)
+tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+
+# Load your preprocessed data
+with open("normans_wikipedia.txt", "r", encoding="utf-8") as file:
+    data = file.read()
+
+# Specify the output directory for the fine-tuned model
+output_dir = "./normans_fine-tuned"
+os.makedirs(output_dir, exist_ok=True)
+
+# Tokenize and encode the data
+input_ids = tokenizer.encode(data, return_tensors="pt")
+
+# Create a dataset and data collator
+dataset = TextDataset(
+    tokenizer=tokenizer,
+    file_path="normans_wikipedia.txt",
+    block_size=512,  # Adjust this value based on your requirements
+)
+data_collator = DataCollatorForLanguageModeling(
+    tokenizer=tokenizer,
+    mlm=False
+)
+
+# Fine-tune the model
+training_args = TrainingArguments(
+    output_dir=output_dir,
+    overwrite_output_dir=True,
+    num_train_epochs=10,
+    per_device_train_batch_size=2,
+    save_steps=10_000,
+    save_total_limit=2,
+    logging_dir=output_dir,  # Add this line for logging
+    logging_steps=100,  # Adjust this value based on your requirements
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    data_collator=data_collator,
+    train_dataset=dataset,
+)
+
+# Training loop
+try:
+    trainer.train()
+except KeyboardInterrupt:
+    print("Training interrupted by user.")
+
+# Save the fine-tuned model
+model.save_pretrained(output_dir)
+tokenizer.save_pretrained(output_dir)
+
+# Load the fine-tuned model
+fine_tuned_model = GPT2LMHeadModel.from_pretrained(output_dir)
+
+# Function to generate responses from the fine-tuned model
+def generate_response(user_input):
+    # Tokenize and encode user input
+    user_input_ids = tokenizer.encode(user_input, return_tensors="pt")
+
+    # Generate response from the fine-tuned model
+    generated_output = fine_tuned_model.generate(
+        user_input_ids,
+        max_length=100,
+        num_beams=5,
+        no_repeat_ngram_size=2,
+        top_k=50,
+        top_p=0.90,
+        temperature=0.9
+    )
+
+    # Decode and return the generated response
+    chatbot_response = tokenizer.decode(
+        generated_output[0], skip_special_tokens=True)
+    return "Chatbot: " + chatbot_response
+
+# Create a Gradio interface
+iface = gr.Interface(
+    fn=generate_response,
+    inputs="text",
+    outputs="text",
+    live=True
+)
+
+# Launch the Gradio interface
+iface.launch()