Update app.py
app.py
CHANGED
@@ -10,6 +10,9 @@ import pandas as pd
 from datetime import datetime, timezone, timedelta
 import notion_df
 import concurrent.futures
+from nltk.tokenize import sent_tokenize
+nltk.download('punkt')
+

 # Define the tokenizer and model
 tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
@@ -29,106 +32,88 @@ API_KEY = os.environ["API_KEY"]
 def transcribe(audio, text):
     global messages
     global answer_count
-
+    messages = [initial_message]
+    messages_rev = [initial_message]
+
     transcript = {'text': ''}
     input_text = []

+    counter = 0
     # Transcribe the audio if provided
     if audio is not None:
         audio_file = open(audio, "rb")
         transcript = openai.Audio.transcribe("whisper-1", audio_file, language="en")
         messages.append({"role": "user", "content": transcript["text"]})
-
-    # Tokenize the text input
-    if text is not None:
-        # Split the input text into sentences
-        sentences = re.split("(?<=[.!?]) +", text)

-
-
+    # Split the input text into sentences
+    sentences = sent_tokenize(text)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    #
-
-        answer_count = 0
-    else:
-        # Increment the answer counter
-        answer_count += 1
-
-    # Generate the system message using the OpenAI API
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        prompt = [{"text": f"{message['role']}: {message['content']}\n\n"} for message in messages]
+    # Split the input text into sub-input tokens based on the condition
+    subinput_tokens = []
+    buffer = []
+    for sentence in sentences:
+        sentence_tokens = tokenizer.encode(sentence)
+        if len(buffer) + len(sentence_tokens) > 800:
+            subinput_tokens.append(buffer)
+            buffer = []
+        buffer.extend(sentence_tokens)
+    if buffer:
+        subinput_tokens.append(buffer)
+
+    chat_transcript = ''
+
+    for tokens in subinput_tokens:
+        # Decode the tokens into text
+        subinput_text = tokenizer.decode(tokens)
+        messages.append({"role": "user", "content": transcript["text"]+str(subinput_text)})
+
+        num_tokens = sum(len(tokenizer.encode(message["content"])) for message in messages)
+        if num_tokens > 2096:
+            # Concatenate the chat history
+            chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages if message['role'] != 'user'])
+            # Append the number of tokens used to the end of the chat transcript
+            chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
+
+            # Get the current time in Eastern Time (ET)
+            now_et = datetime.now(timezone(timedelta(hours=-5)))
+            # Format the time as string (YY-MM-DD HH:MM)
+            published_date = now_et.strftime('%m-%d-%y %H:%M')
+            if counter > 0:
+                # Upload the chat transcript to Notion
+                df = pd.DataFrame([chat_transcript])
+                notion_df.upload(df, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date+'FULL'), api_key=API_KEY)
+                counter += 1
+            messages = [{"role": "system", "content": initial_message}]
+            messages = [{"role": "user", "content": subinput_text}]
+            answer_count = 0
+
+        # Generate the system message using the OpenAI API
+        # with concurrent.futures.ThreadPoolExecutor() as executor:
         system_message = openai.ChatCompletion.create(
             model="gpt-3.5-turbo",
             messages=messages,
             max_tokens=2000
         )["choices"][0]["message"]
-
-
-
-        # messages.append(system_message)
-
-        # Add the system message to the beginning of the messages list
-        messages_rev.insert(0, system_message)
-        # Add the input text to the messages list
-        messages_rev.insert(0, {"role": "user", "content": input_text + transcript["text"]})
-
-        # Concatenate the chat history
-        chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages_rev if message['role'] != 'system'])
-
-        # Append the number of tokens used to the end of the chat transcript
-        chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
+
+        messages.append({"role": "system", "content": str(system_message['content'])})
+        messages_rev.append({"role": "system", "content": str(system_message['content'])})

-
-
-
+    # Concatenate the chat history
+    chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages_rev if message['role'] != 'user'])
+    # if not isinstance(messages[-1]['content'], str):
+    #     continue

-
-
-
-
-
+    # Append the number of tokens used to the end of the chat transcript
+    chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
+    df = pd.DataFrame([chat_transcript])
+    # Get the current time in Eastern Time (ET)
+    now_et = datetime.now(timezone(timedelta(hours=-5)))
+    # Format the time as string (YY-MM-DD HH:MM)
+    published_date = now_et.strftime('%m-%d-%y %H:%M')
+    notion_df.upload(df, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date), api_key=API_KEY)

     # Return the chat transcript
-    return
+    return chat_transcript

 # Define the input and output components for Gradio
 audio_input = Audio(source="microphone", type="filepath", label="Record your message")
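Note on the chunking step this commit introduces: the input text is split into sentences with NLTK's punkt tokenizer and the sentences are packed into sub-inputs of at most 800 GPT-2 tokens, each of which is then sent to gpt-3.5-turbo in turn. The snippet below is a minimal standalone sketch of that packing logic, not part of the commit; it assumes the same 800-token budget, the helper name chunk_text is illustrative, and it adds an explicit import nltk that is not visible in the hunk above.

# Illustrative sketch (not from the commit): pack sentences into chunks
# of at most `max_tokens` GPT-2 tokens, mirroring the loop in the diff.
import nltk
from nltk.tokenize import sent_tokenize
from transformers import GPT2Tokenizer

nltk.download('punkt', quiet=True)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

def chunk_text(text, max_tokens=800):
    chunks, buffer = [], []
    for sentence in sent_tokenize(text):
        sentence_tokens = tokenizer.encode(sentence)
        # Flush the buffer before it would exceed the token budget.
        if buffer and len(buffer) + len(sentence_tokens) > max_tokens:
            chunks.append(buffer)
            buffer = []
        # A single sentence longer than the budget still becomes its own chunk.
        buffer.extend(sentence_tokens)
    if buffer:
        chunks.append(buffer)
    # Decode each token chunk back into text for the chat request.
    return [tokenizer.decode(chunk) for chunk in chunks]

Each decoded chunk plays the role of subinput_text in the loop above: it is appended to messages and passed to openai.ChatCompletion.create, and once the running token count crosses the 2096 threshold the accumulated transcript is uploaded to Notion and the message history is reset.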