yugamj committed
Commit 405cf46 · verified · 1 Parent(s): 5b1e482

updated app.py for v3

Files changed (1)
  1. app.py +79 -21
app.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
  import transformers
  from transformers import GPT2LMHeadModel, GPT2Tokenizer
  import torch
+ import re
  import gradio as gr


@@ -22,27 +23,84 @@ tokenizer.add_tokens(["<bot>:"])


  #Inference function
- def infer(inp, history):
-     inp = "<startofstring>"+inp+"<bot>:"
-     inp_tok = tokenizer(inp, return_tensors="pt")
-     X = inp_tok["input_ids"].to(device)
-     a = inp_tok["attention_mask"].to(device)
-     output = model.generate(X, attention_mask=a )
-     output = tokenizer.decode(output[0])
-     return output[len(inp):]
+ def infer_1(inp, chat_history):
+     inp_1 = "<startofstring>"+inp+"<bot>:"
+     inp_tok = tokenizer(inp_1, return_tensors="pt")
+     X = inp_tok["input_ids"].to(device)
+     a = inp_tok["attention_mask"].to(device)
+     output = model.generate(X, attention_mask=a, max_length=100, num_beams=5, top_k=50)
+     output = tokenizer.decode(output[0])
+     #Remove the user input part of the sequence
+     output = output[len(inp_1):]
+     #Remove newline characters
+     output = re.sub(r'\n', '', output)
+     #Remove <endofstring> and <pad> tokens from the end
+     output = re.sub(r'<endofstring>|<pad>', '', output)
+     #Append to chat history
+     chat_history.append((inp, output))
+     return '', chat_history


+ #Ancillary variables
+ project_heading = """## Project Description
+
+ This chatbot is based on a fine-tuned version of GPT2-Small. Primarily, the text from Robert Kiyosaki's book, "Rich Dad Poor Dad," was processed and used in training.
+
+ **Note:** This project is built for educational purposes; the responses generated by the model should not be interpreted literally.
+ """
+
+ sample_questions = ["Where should I invest to protect my assets?", "What is the result of people working all their lives for someone else?", "Why do you want to earn more passive income?",
+                     "What inspires winners?", "What do most people fail to realize in life?", "Why do people struggle financially?", "What is the name of your developer?",
+                     "What do poor people do with their eggs?"]
+
+ data_processing_story = """
+ <h2>Data Collection and Processing</h2>
+
+ <h3>Step 1: Extracted raw text data from two broad sources</h3>
+
+ <p>1.1. Parsed a digital copy of 'Rich Dad Poor Dad', Link: <a href="https://drive.google.com/file/d/1vTrfwcqI5rMVq7CsVEy38aRwPTjN1s6T/view?usp=sharing">Rich Dad Poor Dad Book</a></p>
+ <p>1.2. Scraped websites with summaries or quotes from the book, such as Link: <a href="https://www.goodreads.com/work/quotes/3366043-rich-dad-poor-dad?page=1">Goodreads Quotes</a></p>
+
+ <h3>Step 2: Data Cleaning and Preprocessing</h3>
+
+ <p>2.1. Removed irregularities from the text data, e.g., additional comments, quotes, special characters, spaces, indentations, etc.</p>
+ <p>2.2. Created batches of text with 512 words each, preserving appropriate text boundaries for the next stage of the pipeline</p>
+
+ <h3>Step 3: Extracted question-answer pairs to create a conversational dataset using open-source models</h3>
+
+ <p>3.1. Used the FAQ module from QUESTgen.AI's API, which produced question and one-word-answer pairs from the given text, Repo Link: <a href="https://github.com/ramsrigouthamg/Questgen.ai">QUESTgen.AI Repository</a></p>
+ <p>3.2. The resulting answers were incomplete and lacked context, creating the need for better, yet still short, answers</p>
+ <p>3.3. Used the model 'roberta-base-squad2' by Deepset to extract meaningful answers, Model Card Link: <a href="https://huggingface.co/deepset/roberta-base-squad2">roberta-base-squad2 Model Card</a></p>
+
+ <h2>Processed Dataset:</h2>
+ """
+
+ model_details_HTML = """<h2>Model Details</h2>
+
+ <p><strong>Base Model Name:</strong> GPT-2 (Small)</p>
+ <p><strong>Training Process:</strong> Fine-tuned GPT-2 for 23 epochs with a batch size of 64 and 100 input tokens</p>
+ <p><strong>Source Code:</strong> <a href="https://huggingface.co/spaces/yugamj/AI_text_detector_01/tree/main">Hugging Face Space Repository</a></p>
+ <p><strong>Base Model Card:</strong> <a href="https://huggingface.co/gpt2">GPT-2 Model Card</a></p>
+ <p><strong>Inference Time:</strong> Average response time is approximately 13 seconds</p>
+ """
+
+ training_data = pd.read_csv('ques_ans_v5.csv', index_col=0)
+
  #Launch with gradio
- gr.ChatInterface(
-     infer,
-     chatbot=gr.Chatbot(height=300),
-     textbox=gr.Textbox(placeholder="Type Here", container=False, scale=10),
-     title="Finance Chatbot Based on Rich Dad Poor Dad",
-     description="This Chatbot is Based on a fine-tuned version of 'GPT2'. Popular quotes of Robert Kiyosaki from his book, 'Rich Dad Poor Dad' and book summary were used for training this model.",
-     theme="soft",
-     examples=["What do you want to earn more passive income?", "What is the result of people working all their lives for someone else?", "What tells the story of how a person handles money?"],
-     cache_examples=True,
-     retry_btn=None,
-     undo_btn="Delete Previous",
-     clear_btn="Clear",
- ).launch()
+ with gr.Blocks() as demo:
+     gr.Markdown(value=project_heading)
+     #gr.Markdown(value="This Chatbot is Based on a fine-tuned version of 'GPT2-Small'. Primarily the text from Robert Kiyosaki's book, \"Rich Dad Poor Dad\" was processed and used in training.")
+
+     chatbot = gr.Chatbot(label="Trained on scripts from \"Rich Dad Poor Dad\"")  #value = [['Hey there User', 'Hey there CB']]
+     msg = gr.Textbox(label='Press enter to submit!')
+     clear = gr.ClearButton([msg, chatbot])
+     gr.Examples(sample_questions, msg)
+
+     msg.submit(infer_1, [msg, chatbot], [msg, chatbot])
+
+     gr.HTML(data_processing_story)
+     gr.DataFrame(training_data[['Question', 'answer_cb']].rename(columns={'answer_cb': 'Answer'}), wrap=True)
+
+     gr.HTML(model_details_HTML)
+
+ demo.launch(debug=True)
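
The data-processing notes shown in the diff (Step 3.3) describe re-answering the Questgen-generated questions with Deepset's roberta-base-squad2 so the answers carry more context. A minimal sketch of what that extractive-QA step could look like, assuming the standard transformers question-answering pipeline; the helper name and the example text batch are illustrative, not taken from this repo:

```python
from transformers import pipeline

# Extractive QA model named in the data-processing notes (Step 3.3).
qa = pipeline("question-answering", model="deepset/roberta-base-squad2")

def extract_answer(question: str, context: str) -> str:
    """Return the highest-scoring answer span found in `context`."""
    result = qa(question=question, context=context)
    return result["answer"]

# Hypothetical text batch from Step 2.2 (illustrative excerpt only).
batch_text = "The poor and the middle class work for money. The rich have money work for them..."
print(extract_answer("Why do people struggle financially?", batch_text))
```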
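The inference function and the model details imply a particular training format: each question-answer pair wrapped with `<startofstring>`, `<bot>:`, and `<endofstring>`, padded with `<pad>`, and capped at 100 input tokens. The training script is not part of this commit, so the snippet below is only a hedged sketch of that tokenizer setup; the special-token mapping and the example sequence are inferred from the inference code, not confirmed:

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Special tokens inferred from infer_1 and the hunk context above.
tokenizer.add_special_tokens({"pad_token": "<pad>",
                              "bos_token": "<startofstring>",
                              "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # make room for the added tokens

# One question-answer row as a single training sequence, padded/truncated to the
# 100-token input length quoted in the model details (the answer is illustrative).
example = ("<startofstring>Why do people struggle financially?<bot>: "
           "Because they never learn to make money work for them.<endofstring>")
enc = tokenizer(example, max_length=100, truncation=True,
                padding="max_length", return_tensors="pt")
```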