son9john committed on
Commit
934b06c
·
1 Parent(s): 8e4b3a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +192 -101
app.py CHANGED
@@ -15,13 +15,57 @@ from nltk.tokenize import sent_tokenize
15
  nltk.download('punkt')
16
 
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # Define the tokenizer and model
 
 
 
 
 
 
19
  tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
20
  model = openai.api_key = os.environ["OPENAI_API_KEY"]
21
 
22
  # Define the initial message and messages list
23
- initmessage = 'You are a USMLE Tutor. Respond with ALWAYS layered "bullet points" (listing rather than sentences) to all input with a fun mneumonics to memorize that list. But you can answer up to 1200 words if the user requests longer response.'
24
- initial_message = {"role": "system", "content": 'You are a USMLE Tutor. Respond with ALWAYS layered "bullet points" (listing rather than sentences) to all input with a fun mneumonics to memorize that list. But you can answer up to 1200 words if the user requests longer response.'}
 
25
  messages = [initial_message]
26
  messages_rev = [initial_message]
27
 
@@ -31,138 +75,185 @@ answer_count = 0
31
  # Define the Notion API key
32
  API_KEY = os.environ["API_KEY"]
33
 
34
- def transcribe(audio, text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  global messages
36
  global answer_count
37
- messages = [initial_message]
38
- messages_rev = [initial_message]
39
- chat_transcript = ''
40
-
41
  transcript = {'text': ''}
42
  input_text = []
43
-
44
- counter = 0
45
  # Transcribe the audio if provided
46
  if audio is not None:
47
  audio_file = open(audio, "rb")
48
  transcript = openai.Audio.transcribe("whisper-1", audio_file, language="en")
49
- messages.append({"role": "user", "content": transcript["text"]})
50
- system_message = openai.ChatCompletion.create(
51
- model="gpt-3.5-turbo",
52
- messages=messages,
53
- max_tokens=2000
54
- )["choices"][0]["message"]
55
-
56
- messages.append({"role": "system", "content": str(system_message['content'])})
57
- messages_rev.append({"role": "system", "content": str(system_message['content'])})
58
-
59
- # Concatenate the chat history
60
- chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages_rev if message['role'] != 'user'])
61
- # if not isinstance(messages[-1]['content'], str):
62
- # continue
63
-
64
- # Append the number of tokens used to the end of the chat transcript
65
- df = pd.DataFrame([chat_transcript])
66
- # Get the current time in Eastern Time (ET)
67
- now_et = datetime.now(timezone(timedelta(hours=-4)))
68
- # Format the time as string (YY-MM-DD HH:MM)
69
- published_date = now_et.strftime('%m-%d-%y %H:%M')
70
- notion_df.upload(df, 'https://www.notion.so/US-My-04095f009651427bb8247b9e680b18e5?pvs=4', title=str(published_date), api_key=API_KEY)
71
-
72
  if text is not None:
73
  # Split the input text into sentences
74
- sentences = sent_tokenize(text)
 
 
 
75
 
76
- # Split the input text into sub-input tokens based on the condition
77
- subinput_tokens = []
78
- buffer = []
79
  for sentence in sentences:
 
80
  sentence_tokens = tokenizer.encode(sentence)
81
- if len(buffer) + len(sentence_tokens) > 400:
82
- subinput_tokens.append(buffer)
83
- buffer = []
84
- buffer.extend(sentence_tokens)
85
- if buffer:
86
- subinput_tokens.append(buffer)
87
- chat_transcript = ""
88
- for tokens in subinput_tokens:
89
- # Decode the tokens into text
90
- subinput_text = tokenizer.decode(tokens)
91
- messages.append({"role": "system", "content": initmessage})
92
- messages.append({"role": "user", "content": transcript["text"]+subinput_text})
93
-
94
- num_tokens = sum(len(tokenizer.encode(message["content"])) for message in messages)
95
- if num_tokens > 1400:
96
- # Concatenate the chat history
97
- chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages if message['role'] != 'user'])
98
- # Append the number of tokens used to the end of the chat transcript
99
- chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
100
-
101
- # Get the current time in Eastern Time (ET)
102
- now_et = datetime.now(timezone(timedelta(hours=-5)))
103
- # Format the time as string (YY-MM-DD HH:MM)
104
- published_date = now_et.strftime('%m-%d-%y %H:%M')
105
- if counter > 0:
106
- # Upload the chat transcript to Notion
107
- df = pd.DataFrame([chat_transcript])
108
- notion_df.upload(df, 'https://www.notion.so/US-My-04095f009651427bb8247b9e680b18e5?pvs=4', title=str(published_date), api_key=API_KEY)
109
-
110
- counter += 1
111
- messages = [{"role": "system", "content": initial_message}]
112
- messages = [{"role": "user", "content": subinput_text}]
113
- answer_count = 0
114
 
115
- # Generate the system message using the OpenAI API
116
- # with concurrent.futures.ThreadPoolExecutor() as executor:
117
- system_message = openai.ChatCompletion.create(
118
- model="gpt-3.5-turbo",
119
- messages=messages,
120
- max_tokens=2000
121
- )["choices"][0]["message"]
122
-
123
- messages.append({"role": "system", "content": str(system_message['content'])})
124
- # messages_rev.append({"role": "system", "content": str(system_message['content'])})
125
- # Add the system message to the beginning of the messages list
126
- messages_rev.insert(0, system_message)
127
- # Add the input text to the messages list
128
- messages_rev.insert(0, {"role": "user", "content": subinput_text + transcript["text"]})
129
-
130
- chat_transcript = f"\n\nNumber of tokens used: {num_tokens}\n\n"
131
 
 
 
 
132
  # Concatenate the chat history
133
- chat_transcript += "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages_rev if message['role'] != 'user'])
134
- # if not isinstance(messages[-1]['content'], str):
135
- # continue
136
-
137
- # Append the number of tokens used to the end of the chat transcript
138
- df = pd.DataFrame([chat_transcript])
139
  # Get the current time in Eastern Time (ET)
140
  now_et = datetime.now(timezone(timedelta(hours=-4)))
141
  # Format the time as string (YY-MM-DD HH:MM)
142
  published_date = now_et.strftime('%m-%d-%y %H:%M')
143
- notion_df.upload(df, 'https://www.notion.so/US-My-04095f009651427bb8247b9e680b18e5?pvs=4', title=str(published_date), api_key=API_KEY)
144
 
145
- # Return the chat transcript
146
- return chat_transcript
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
-
 
 
 
149
  # Define the input and output components for Gradio
150
  audio_input = Audio(source="microphone", type="filepath", label="Record your message")
151
  text_input = Textbox(label="Type your message", max_length=4096)
152
- output_text = gr.outputs.Textbox(label="Response")
153
- output_audio = Audio()
 
154
 
155
  # Define the Gradio interface
156
  iface = gr.Interface(
157
  fn=transcribe,
158
  inputs=[audio_input, text_input],
159
- outputs=[output_text],
160
- title="Hold On, Pain Ends (HOPE) 2",
161
- description="Talk to Your Nephrology Tutor HOPE",
162
  theme="compact",
163
  layout="vertical",
164
  allow_flagging=False
165
- )
 
166
 
167
  # Run the Gradio interface
168
  iface.launch()
 
15
  nltk.download('punkt')
16
 
17
 
18
+ # # Define the tokenizer and model
19
+ # tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
20
+ # model = openai.api_key = os.environ["OPENAI_API_KEY"]
21
+
22
+ # # Define the initial message and messages list
23
+ # initmessage = 'You are a USMLE Tutor. Respond with ALWAYS layered "bullet points" (listing rather than sentences) to all input with a fun mneumonics to memorize that list. But you can answer up to 1200 words if the user requests longer response.'
24
+ # initial_message = {"role": "system", "content": 'You are a USMLE Tutor. Respond with ALWAYS layered "bullet points" (listing rather than sentences) to all input with a fun mneumonics to memorize that list. But you can answer up to 1200 words if the user requests longer response.'}
25
+ # messages = [initial_message]
26
+ # messages_rev = [initial_message]
27
+
28
+ # # Define the answer counter
29
+ # answer_count = 0
30
+
31
+ # # Define the Notion API key
32
+ # API_KEY = os.environ["API_KEY"]
33
+
34
+ import openai
35
+ import gradio as gr
36
+ from gradio.components import Audio, Textbox
37
+ import os
38
+ import re
39
+ import tiktoken
40
+ from transformers import GPT2Tokenizer
41
+ import whisper
42
+ import pandas as pd
43
+ from datetime import datetime, timezone, timedelta
44
+ import notion_df
45
+ import concurrent.futures
46
+ import nltk
47
+ from nltk.tokenize import sent_tokenize
48
+ nltk.download('punkt')
49
+ import spacy
50
+ from spacy import displacy
51
+ from gradio import Markdown
52
+ import threading
53
+
54
+
55
# Define the tokenizer and model

# openai.api_type = "azure"
# openai.api_base = "https://yena.openai.azure.com/"
# openai.api_version = "2022-12-01"


# The GPT-2 tokenizer is used only for local token counting/truncation;
# completions themselves come from the OpenAI API.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
# NOTE(review): despite its name, `model` ends up holding the API-key string —
# this line chain-assigns openai.api_key. Confirm nothing expects a model object.
model = openai.api_key = os.environ["OPENAI_API_KEY"]

# Define the initial message and messages list
# `initmessage` is re-injected as a user message after a history reset;
# `initial_message` is the persistent system prompt.
initmessage = 'You are a MCAT Tutor. Respond with ALWAYS layered "bullet points" (listing rather than sentences) to all input with a fun mneumonics to memorize that list. But you can answer up to 1200 words if the user requests longer response.'
initial_message = {"role": "system", "content": 'You are a MCAT Tutor. Pay especially attention to "testable" or "exam," or any related terms in the input and highlight them as "EXAM TOPIC." Respond ALWAYS quiz me with high yield and relevant qustions on the input and the answers layed out with layered "bullet points" (listing rather than sentences) to all input with a fun mneumonics to memorize that list. Expand on each point with great detail lists not sentence.'}

# Conversation history (newest-last); `messages_rev` holds a newest-first copy.
messages = [initial_message]
messages_rev = [initial_message]

# Define the Notion API key (consumed by notion_df.upload for transcript backups)
API_KEY = os.environ["API_KEY"]

# Define the answer counter (resets whenever the history is flushed to Notion)
answer_count = 0
81
# Shared spaCy pipeline used by the colorization helpers below.
nlp = spacy.load("en_core_web_sm")


def process_nlp(system_message):
    """Return the chat message's content rendered as color-coded HTML/Markdown."""
    return colorize_text(system_message['content'])
86
+
87
def colorize_text(text):
    """Render *text* as HTML, color-coding each token by its spaCy analysis.

    Named entities are bolded; nouns, verbs, adjectives, adverbs, digits,
    punctuation and quote marks each get a fixed color; everything else is
    passed through unchanged. Each input line ends with a `<br>` tag.
    """
    # Part-of-speech tags that take a fixed color (checked before the
    # digit/punct/quote fallbacks, matching the original priority order).
    pos_palette = {
        'NOUN': '#FF3300',
        'VERB': '#FFFF00',
        'ADJ': '#00CC00',
        'ADV': '#FF6600',
    }
    pieces = []
    for line in text.split("\n"):
        for token in nlp(line):
            chunk = token.text_with_ws
            if token.ent_type_:
                # Entities win over every other category.
                pieces.append(f'**{chunk}**')
                continue
            color = pos_palette.get(token.pos_)
            if color is None:
                if token.is_digit:
                    color = '#9900CC'
                elif token.is_punct:
                    color = '#8B4513'
                elif token.is_quote:
                    color = '#008080'
            if color is None:
                pieces.append(chunk)
            else:
                pieces.append(f'<span style="color: {color}; background-color: transparent;">{chunk}</span>')
        pieces.append("<br>")
    return "".join(pieces)
115
+
116
+
117
def colorize_and_update(system_message, submit_update):
    """Colorize the finished reply and push it to the UI's second output slot."""
    html = colorize_text(system_message['content'])
    # First slot (plain text) is left untouched; it was filled earlier.
    submit_update(None, html)
120
+
121
def update_text_output(system_message, submit_update):
    """Immediately show the reply's plain text in the first output slot."""
    reply_text = system_message['content']
    # The second (HTML) slot stays empty here; colorization happens later
    # on a background thread.
    submit_update(reply_text, None)
123
+
124
+
125
def transcribe(audio, text, submit_update=None):
    """Run one tutoring turn and return the reply as (plain text, colorized HTML).

    Args:
        audio: Path to a recorded audio file, or None. If given, it is
            transcribed with OpenAI Whisper and prepended to the typed text.
        text: The typed user message, or None.
        submit_update: Optional callable(text_out, html_out) used to push
            partial results to the Gradio UI as they become available.

    Returns:
        A 2-tuple ``(reply_text, colorized_html)``.

    Side effects: mutates module-level ``messages``, ``messages_rev`` and
    ``answer_count``; uploads transcripts to Notion.
    """
    global messages
    global answer_count

    transcript = {'text': ''}
    # FIX: was `input_text = []`; a list made `transcript["text"] + input_text`
    # raise TypeError whenever `text` is None (audio-only input).
    input_text = ''

    # Transcribe the audio if provided (FIX: `with` closes the file handle,
    # which was previously leaked).
    if audio is not None:
        with open(audio, "rb") as audio_file:
            transcript = openai.Audio.transcribe("whisper-1", audio_file, language="en")

    # Tokenize the text input
    if text is not None:
        # Split the input text into sentences
        sentences = re.split("(?<=[.!?]) +", text)

        # Accumulate sentence tokens up to a 1440-token budget.
        input_tokens = []
        for sentence in sentences:
            # Tokenize the sentence using the GPT-2 tokenizer
            sentence_tokens = tokenizer.encode(sentence)
            if len(input_tokens) + len(sentence_tokens) < 1440:
                input_tokens.extend(sentence_tokens)
            else:
                # Truncate the sentence that would overflow the budget, then stop.
                sentence_tokens = sentence_tokens[:1440 - len(input_tokens)]
                input_tokens.extend(sentence_tokens)
                break
        # Decode the (possibly truncated) tokens back into text.
        input_text = tokenizer.decode(input_tokens)

    # Add the combined audio transcript + typed text to the history.
    messages.append({"role": "user", "content": transcript["text"] + input_text})

    # If the accumulated history exceeds 2096 tokens, back it up to Notion and
    # restart the conversation with just the prompts plus the current input.
    num_tokens = sum(len(tokenizer.encode(message["content"])) for message in messages)
    if num_tokens > 2096:
        # Concatenate the chat history (non-system turns only).
        chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages if message['role'] != 'system'])
        # Append the number of tokens used to the end of the chat transcript.
        chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"

        # Timestamp in Eastern Time (fixed UTC-4 offset; does not track DST).
        now_et = datetime.now(timezone(timedelta(hours=-4)))
        published_date = now_et.strftime('%m-%d-%y %H:%M')

        # Upload the chat transcript to Notion.
        df = pd.DataFrame([chat_transcript])
        notion_df.upload(df, 'https://www.notion.so/YENA-be569d0a40c940e7b6e0679318215790?pvs=4', title=str(published_date + 'back_up'), api_key=API_KEY)

        # Reset the history: system prompt, tutoring instructions, then the
        # current input again so the next completion still sees it.
        messages = [initial_message]
        messages.append({"role": "user", "content": initmessage})
        answer_count = 0
        messages.append({"role": "user", "content": input_text})
    else:
        # Increment the answer counter
        answer_count += 1

    # Generate the reply with the OpenAI API. (An unused ThreadPoolExecutor
    # wrapper and a dead `prompt` list were removed here; neither had any effect.)
    system_message = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=2000
    )["choices"][0]["message"]

    # Immediately push the plain-text reply to the UI, if a callback was given.
    if submit_update:
        update_text_output(system_message, submit_update)

    # Record the reply: newest-last in `messages`, newest-first in `messages_rev`.
    messages.append(system_message)
    messages_rev.insert(0, system_message)
    messages_rev.insert(0, {"role": "user", "content": input_text + transcript["text"]})

    # Colorize on a background thread so the plain text appears without waiting.
    if submit_update:
        colorize_thread = threading.Thread(target=colorize_and_update, args=(system_message, submit_update))
        colorize_thread.start()

    chat_transcript = system_message['content']

    # Timestamp and back up this single reply to Notion as well.
    now_et = datetime.now(timezone(timedelta(hours=-4)))
    published_date = now_et.strftime('%m-%d-%y %H:%M')
    df = pd.DataFrame([chat_transcript])
    notion_df.upload(df, 'https://www.notion.so/YENA-be569d0a40c940e7b6e0679318215790?pvs=4', title=str(published_date + 'back_up'), api_key=API_KEY)

    return system_message['content'], colorize_text(system_message['content'])
235
+
236
+
237
+
238
# Define the input components for Gradio
audio_input = Audio(source="microphone", type="filepath", label="Record your message")
text_input = Textbox(label="Type your message", max_length=4096)
# Define the output components for Gradio (plain text + colorized Markdown/HTML)
output_text = Textbox(label="Text Output")
output_html = Markdown()

# Define the Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=[audio_input, text_input],
    outputs=[output_text, output_html],  # transcribe returns (text, html)
    title="Hold On, Pain Ends (HOPE)",
    description="Talk to Your USMLE Tutor HOPE",
    theme="compact",
    layout="vertical",
    allow_flagging=False
)


# Run the Gradio interface
iface.launch()