Spaces:
Sleeping
Sleeping
updated app.py for v3
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
|
|
4 |
import transformers
|
5 |
from transformers import GPT2LMHeadModel, GPT2Tokenizer
|
6 |
import torch
|
|
|
7 |
import gradio as gr
|
8 |
|
9 |
|
@@ -22,27 +23,84 @@ tokenizer.add_tokens(["<bot>:"])
|
|
22 |
|
23 |
|
24 |
#Inference function
|
25 |
-
def
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
#Launch with gradio
|
36 |
-
gr.
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
4 |
import transformers
|
5 |
from transformers import GPT2LMHeadModel, GPT2Tokenizer
|
6 |
import torch
|
7 |
+
import re
|
8 |
import gradio as gr
|
9 |
|
10 |
|
|
|
23 |
|
24 |
|
25 |
#Inference function
|
26 |
+
#Inference function
def infer_1(inp, chat_history):
    """Generate a chatbot reply for *inp* and append the exchange to the history.

    Parameters
    ----------
    inp : str
        Raw user message from the gradio textbox.
    chat_history : list
        Gradio chat history (list of (user, bot) pairs); mutated in place.

    Returns
    -------
    tuple
        ('', chat_history) — the empty string clears the input textbox.
    """
    # Wrap the message in the same control tokens the model was trained with.
    inp_1 = "<startofstring>" + inp + "<bot>:"
    inp_tok = tokenizer(inp_1, return_tensors="pt")
    X = inp_tok["input_ids"].to(device)
    a = inp_tok["attention_mask"].to(device)
    output = model.generate(X, attention_mask=a, max_length=100, num_beams=5, top_k=50)
    # FIX: strip the prompt by *token* count instead of slicing the decoded
    # string with len(inp_1).  Tokenizer encode->decode round-trips do not
    # preserve the prompt text byte-for-byte (whitespace around tokens can
    # shift), so character slicing could truncate the reply or leave prompt
    # fragments behind.  generate() returns the input ids followed by the
    # continuation, so everything past X.shape[-1] is the generated part.
    generated_ids = output[0][X.shape[-1]:]
    output = tokenizer.decode(generated_ids)
    # Remove newlines and the trailing <endofstring>/<pad> control tokens in
    # a single pass, then trim stray surrounding whitespace.
    output = re.sub(r'\n|<endofstring>|<pad>', '', output).strip()
    # Append to chat history
    chat_history.append((inp, output))
    return '', chat_history
|
42 |
|
43 |
|
44 |
+
#Ancillary variables

# Markdown banner shown at the top of the app.
project_heading = """## Project Description

This Chatbot is based on a fine-tuned version of 'GPT2-Small'. Primarily, the text from Robert Kiyosaki's book, "Rich Dad Poor Dad," was processed and used in training.

**Note:** This project is built for educational purposes, and it is important to note that the responses generated by the model should not be interpreted literally.
"""

# Clickable example prompts for the gr.Examples widget (one per line for readability).
sample_questions = [
    "Where should I invest in to protect my assets?",
    "What is the result of people working all their lives for someone else?",
    "Why do you want to earn more passive income?",
    "What inspires winners?",
    "What do most people fail to realize in life?",
    "Why do people struggle financially?",
    "What is the name of your developer?",
    "What do poor people do with their eggs?",
]

# HTML write-up of how the training data was collected and processed.
data_processing_story = """
<h2>Data Collection and Processing</h2>

<h3>Step 1: Extracted raw text data from two broad sources</h3>

<p>1.1. Digital copy of 'Rich Dad Poor Dad' by Parsing the book, Link: <a href="https://drive.google.com/file/d/1vTrfwcqI5rMVq7CsVEy38aRwPTjN1s6T/view?usp=sharing">Rich Dad Poor Dad Book</a></p>
<p>1.2. Scraping websites with summary or quotes from the book such as Link: <a href="https://www.goodreads.com/work/quotes/3366043-rich-dad-poor-dad?page=1">Goodreads Quotes</a></p>

<h3>Step 2: Data Cleaning and Preprocessing</h3>

<p>2.1. Removed irregularities from the text data, e.g., additional comments, quotes, special characters, spaces, indentations, etc.</p>
<p>2.2. Created batches of text with 512 words each, ascertaining appropriate text boundaries for the next stage of the pipeline</p>

<h3>Step 3: Extracted data in the form of question-answer to create a conversational dataset using open source models</h3>

<p>3.1. Used FAQ module from QUESTgen.AI's API, which produced question and one-word answer pairs from the given text, Repo Link: <a href="https://github.com/ramsrigouthamg/Questgen.ai">QUESTgen.AI Repository</a></p>
<p>3.2. Resulting answers didn't carry the complete context and were incomplete, thus arising the need for better and short answers</p>
<p>3.3. Used the model 'roberta-base-squad2' by Deepset for extracting meaningful answers, Model Card Link: <a href="https://huggingface.co/deepset/roberta-base-squad2">roberta-base-squad2 Model Card</a></p>

<h2>Processed Dataset:</h2>
"""

# HTML summary of the fine-tuned model's details.
model_details_HTML = """<h2>Model Details</h2>

<p><strong>Base Model Name:</strong> GPT-2 (Small)</p>
<p><strong>Training Process:</strong> Fine-tuned using GPT-2 with 23 epochs, 64 batch size, and 100 input tokens</p>
<p><strong>Source Code:</strong> <a href="https://huggingface.co/spaces/yugamj/AI_text_detector_01/tree/main">GitHub Repository</a></p>
<p><strong>Base Model Card:</strong> <a href="https://huggingface.co/gpt2">GPT-2 Model Card</a></p>
<p><strong>Inference Time:</strong> Average response time is approximately 13 seconds</p>
"""

# Question/answer pairs used for training; first CSV column is the index.
training_data = pd.read_csv('ques_ans_v5.csv', index_col = 0)
|
88 |
+
|
89 |
#Launch with gradio
with gr.Blocks() as demo:
    # Top banner: project description in Markdown.
    gr.Markdown(value = project_heading)

    # Chat widget, user input box, clear button, and example prompts.
    chatbot = gr.Chatbot(label = "Trained on scripts from \"Rich Dad Poor Dad\"")
    msg = gr.Textbox(label = 'Press enter to submit!')
    clear = gr.ClearButton([msg, chatbot])
    gr.Examples(sample_questions, msg)

    # Submitting the textbox runs inference and refreshes both widgets.
    msg.submit(infer_1, [msg, chatbot], [msg, chatbot])

    # Data-processing write-up followed by the training Q/A table.
    gr.HTML(data_processing_story)
    qa_table = training_data[['Question', 'answer_cb']].rename(
        columns = {'Question':'Question', 'answer_cb':'Answer'}
    )
    gr.DataFrame(qa_table, wrap = True)

    # Model details footer.
    gr.HTML(model_details_HTML)

demo.launch(debug=True)
|