son9john committed on
Commit 98cf098 · 1 Parent(s): ac2595c

Update app.py

Files changed (1)
  1. app.py +80 -91

app.py CHANGED
@@ -3,151 +3,140 @@ import gradio as gr
 from gradio.components import Audio, Textbox
 import os
 import re
 from transformers import GPT2Tokenizer
 import whisper
 import pandas as pd
 from datetime import datetime, timezone, timedelta
 import notion_df
-import tiktoken

-openai.api_key = os.environ["OPENAI_API_KEY"]
-tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

 initial_message = {"role": "system", "content": 'You are a USMLE Tutor. ALWAYS respond with layered "bullet points" (lists rather than sentences), and include a fun mnemonic to memorize each list. You may answer in up to 1200 words if the user requests a longer response.'}
 messages = [initial_message]

 answer_count = 0

-# set up whisper model
-model = whisper.load_model("base")
-
-def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
-    """Returns the number of tokens used by a list of messages."""
-    try:
-        encoding = tiktoken.encoding_for_model(model)
-    except KeyError:
-        encoding = tiktoken.get_encoding("cl100k_base")
-    if model == "gpt-3.5-turbo-0301":  # note: future models may deviate from this
-        num_tokens = 0
-        for message in messages:
-            num_tokens += 4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
-            for key, value in message.items():
-                num_tokens += len(encoding.encode(value))
-                if key == "name":  # if there's a name, the role is omitted
-                    num_tokens += -1  # role is always required and always 1 token
-        num_tokens += 2  # every reply is primed with <im_start>assistant
-        return num_tokens
-    else:
-        raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}.
-See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")

 def transcribe(audio, text):
     global messages
     global answer_count
-    transcript = {'text': ''}
-    input_text = []
     if audio is not None:
         audio_file = open(audio, "rb")
         transcript = openai.Audio.transcribe("whisper-1", audio_file, language="en")
         messages.append({"role": "user", "content": transcript["text"]})

     if text is not None:
         # Split the input text into sentences
         sentences = re.split("(?<=[.!?]) +", text)

-        # Tokenize the sentences using the GPT-2 tokenizer
-        sentence_tokens = [tokenizer.encode(sentence) for sentence in sentences]
-
-        # Flatten the list of tokens
-        input_tokens = [token for sentence in sentence_tokens for token in sentence]
-
-        # Check if adding the input tokens would exceed the token limit
-        num_tokens = num_tokens_from_messages(messages)
-        if num_tokens + len(input_tokens) > 2200:
-            # Reset the messages list and answer counter
-            messages = [initial_message]
-            answer_count = 0
-            input_text = 'Can you click the Submit button one more time? (say Yes)'
-        else:
-            # Add the input tokens to the messages list
-            input_text = tokenizer.decode(input_tokens)

-    messages.append({"role": "user", "content": transcript["text"] + input_text})
-
-    # Check if the accumulated tokens have exceeded the limit
-    num_tokens = num_tokens_from_messages(messages)
     if num_tokens > 2096:
         # Concatenate the chat history
-        chat_transcript = ""
-        for message in messages:
-            if message['role'] != 'system':
-                chat_transcript += f"[ANSWER {answer_count}]{message['role']}: {message['content']}\n\n"
         # Append the number of tokens used to the end of the chat transcript
-        chat_transcript += f"Number of tokens used: {num_tokens}\n\n"
-
         # Get the current time in Eastern Time (ET)
         now_et = datetime.now(timezone(timedelta(hours=-5)))
         # Format the time as string (YY-MM-DD HH:MM)
         published_date = now_et.strftime('%m-%d-%y %H:%M')
-
         # Upload the chat transcript to Notion
         df = pd.DataFrame([chat_transcript])
-        notion_df.upload(df, 'https://www.notion.so/personal-5e3978680ca848bda844452129955138?pvs=4', title=str(published_date), api_key=API_KEY)
-
         # Reset the messages list and answer counter
         messages = [initial_message]
         answer_count = 0
-
-    # Increment the answer counter
-    answer_count += 1

     # Generate the system message using the OpenAI API
-    system_message = openai.ChatCompletion.create(
-        model="gpt-3.5-turbo",
-        messages=messages,
-        max_tokens=2000
-    )["choices"][0]["message"]
-

     # Add the system message to the messages list
-    messages.append({"role": "system", "content": system_message})
-
     # Concatenate the chat history
-    chat_transcript = ""
-    for message in messages:
-        if message['role'] != 'system':
-            chat_transcript += f"[ANSWER {answer_count}]{message['role']}: {message['content']}\n\n"

     # Append the number of tokens used to the end of the chat transcript
-    num_tokens = num_tokens_from_messages(messages)

-    # Get the current time in Eastern Time (ET)
-    now_et = datetime.now(timezone(timedelta(hours=-5)))
-    # Format the time as string (YY-MM-DD HH:MM)
-    published_date = now_et.strftime('%m-%d-%y %H:%M')
-    chat_transcript_copy = chat_transcript
-    chat_transcript_copy += f"Number of tokens used: {num_tokens}\n\n"

     # Upload the chat transcript to Notion
-    df = pd.DataFrame([chat_transcript_copy])
-    notion_df.upload(df, 'https://www.notion.so/personal-5e3978680ca848bda844452129955138?pvs=4', title=str(published_date), api_key=API_KEY)

     # Return the chat transcript
     return chat_transcript
-
 audio_input = Audio(source="microphone", type="filepath", label="Record your message")
 text_input = Textbox(label="Type your message", max_length=4096)
-
 output_text = gr.outputs.Textbox(label="Response")

 iface = gr.Interface(
     fn=transcribe,
     inputs=[audio_input, text_input],
-    outputs="text",
-    title="YENA",
-    description="Tutor YENA")
-
-# Launch Gradio interface
-iface.launch()

 from gradio.components import Audio, Textbox
 import os
 import re
+import tiktoken
 from transformers import GPT2Tokenizer
 import whisper
 import pandas as pd
 from datetime import datetime, timezone, timedelta
 import notion_df
+import concurrent.futures

+# Define the tokenizer and set the OpenAI API key
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
+openai.api_key = os.environ["OPENAI_API_KEY"]

+# Define the initial message and messages list
 initial_message = {"role": "system", "content": 'You are a USMLE Tutor. ALWAYS respond with layered "bullet points" (lists rather than sentences), and include a fun mnemonic to memorize each list. You may answer in up to 1200 words if the user requests a longer response.'}
 messages = [initial_message]

+# Define the answer counter
 answer_count = 0

+# Define the Notion API key
+API_KEY = os.environ["API_KEY"]

 def transcribe(audio, text):
     global messages
     global answer_count
+
+    # Transcribe the audio if provided
     if audio is not None:
         audio_file = open(audio, "rb")
         transcript = openai.Audio.transcribe("whisper-1", audio_file, language="en")
         messages.append({"role": "user", "content": transcript["text"]})

+    # Tokenize the text input
     if text is not None:
         # Split the input text into sentences
         sentences = re.split("(?<=[.!?]) +", text)
+
+        # Initialize a list to store the tokens
+        input_tokens = []
+
+        # Add each sentence to the input_tokens list
+        for sentence in sentences:
+            # Tokenize the sentence using the GPT-2 tokenizer
+            sentence_tokens = tokenizer.encode(sentence)
+            # Check if adding the sentence would exceed the token limit
+            if len(input_tokens) + len(sentence_tokens) < 1440:
+                # Add the sentence tokens to the input_tokens list
+                input_tokens.extend(sentence_tokens)
+            else:
+                # If adding the sentence would exceed the token limit, truncate it
+                sentence_tokens = sentence_tokens[:1440 - len(input_tokens)]
+                input_tokens.extend(sentence_tokens)
+                break
+        # Decode the input tokens into text
+        input_text = tokenizer.decode(input_tokens)
+
+        # Add the input text to the messages list
+        messages.append({"role": "user", "content": input_text})

+    # Check if the accumulated tokens have exceeded 2096
+    num_tokens = sum(len(tokenizer.encode(message["content"])) for message in messages)

     if num_tokens > 2096:
         # Concatenate the chat history
+        chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages if message['role'] != 'system'])
+
         # Append the number of tokens used to the end of the chat transcript
+        chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
+
         # Get the current time in Eastern Time (ET)
         now_et = datetime.now(timezone(timedelta(hours=-5)))
         # Format the time as string (YY-MM-DD HH:MM)
         published_date = now_et.strftime('%m-%d-%y %H:%M')
+
         # Upload the chat transcript to Notion
         df = pd.DataFrame([chat_transcript])
+        notion_df.upload(df, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date), api_key=API_KEY)
+
         # Reset the messages list and answer counter
         messages = [initial_message]
         answer_count = 0
+    else:
+        # Increment the answer counter
+        answer_count += 1

     # Generate the system message using the OpenAI API
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        future = executor.submit(
+            openai.ChatCompletion.create,
+            model="gpt-3.5-turbo",
+            messages=messages,
+            max_tokens=2000,
+        )
+        # Wait for the completion of the OpenAI API call
+        system_message = future.result()["choices"][0]["message"]

     # Add the system message to the messages list
+    messages.append(system_message)

  # Concatenate the chat history
105
+ chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages if message['role'] != 'system'])
 
 
 
106
 
107
  # Append the number of tokens used to the end of the chat transcript
108
+ chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
109
 
110
+ # Save the chat transcript to a file
111
+ with open("conversation_history.txt", "a") as f:
112
+ f.write(chat_transcript)
 
 
 
113
 
114
  # Upload the chat transcript to Notion
115
+ now_et = datetime.now(timezone(timedelta(hours=-5)))
116
+ published_date = now_et.strftime('%m-%d-%y %H:%M')
117
+ df = pd.DataFrame([chat_transcript])
118
+ notion_df.upload(df, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date), api_key=API_KEY)
119
 
120
  # Return the chat transcript
121
  return chat_transcript
122
+
123
+ # Define the input and output components for Gradio
124
  audio_input = Audio(source="microphone", type="filepath", label="Record your message")
125
  text_input = Textbox(label="Type your message", max_length=4096)
 
126
  output_text = gr.outputs.Textbox(label="Response")
127
+ output_audio = Audio()
128
 
129
+ # Define the Gradio interface
130
  iface = gr.Interface(
131
  fn=transcribe,
132
  inputs=[audio_input, text_input],
133
+ outputs=[output_text],
134
+ title="USMLE Tutor Chatbot",
135
+ description="A chatbot for USMLE test preparation",
136
+ theme="compact",
137
+ layout="vertical",
138
+ allow_flagging=False
139
+ )
140
+
141
+ # Run the Gradio interface
142
+ iface.launch()
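
Note on the change above: the new transcribe() drops the tiktoken-based num_tokens_from_messages() helper and instead counts tokens with the GPT-2 tokenizer, accumulating whole sentences until a 1440-token input budget is reached and truncating the last sentence to fit. A minimal standalone sketch of that budgeting logic (the helper name truncate_to_budget is ours, not the app's, and GPT-2 counts only approximate the tokenizer gpt-3.5-turbo actually uses):

import re
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

def truncate_to_budget(text, budget=1440):
    """Keep whole sentences until the token budget is hit, then truncate."""
    sentences = re.split("(?<=[.!?]) +", text)
    input_tokens = []
    for sentence in sentences:
        sentence_tokens = tokenizer.encode(sentence)
        if len(input_tokens) + len(sentence_tokens) < budget:
            input_tokens.extend(sentence_tokens)
        else:
            # Keep only as many tokens of this sentence as still fit
            input_tokens.extend(sentence_tokens[:budget - len(input_tokens)])
            break
    return tokenizer.decode(input_tokens)

print(truncate_to_budget("First point. Second point! Third point?", budget=8))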
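
For comparison, the removed helper counted tokens with tiktoken, which matches the chat model's real encoding, so the new sum over GPT-2 encodings can drift from what the API actually counts. A rough tiktoken equivalent of the new 2096 check (this sketch ignores the roughly 4-token ChatML overhead per message that the removed helper accounted for):

import tiktoken

# cl100k_base is the encoding used by gpt-3.5-turbo
encoding = tiktoken.get_encoding("cl100k_base")

def count_content_tokens(messages):
    # Sum token counts of message contents only, mirroring the new check
    return sum(len(encoding.encode(message["content"])) for message in messages)

history = [{"role": "user", "content": "Explain the brachial plexus."}]
if count_content_tokens(history) > 2096:
    print("Reset the conversation history")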