son9john committed
Commit e608162 · 1 Parent(s): c447195

Update app.py

Files changed (1)
  1. app.py +66 -81
app.py CHANGED
@@ -10,6 +10,9 @@ import pandas as pd
 from datetime import datetime, timezone, timedelta
 import notion_df
 import concurrent.futures
+from nltk.tokenize import sent_tokenize
+nltk.download('punkt')
+
 
 # Define the tokenizer and model
 tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
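A note on the new imports: the hunk calls `nltk.download('punkt')` but only imports `sent_tokenize`, and `nltk` itself is never imported, so that line raises a `NameError` as written. A minimal preamble that would run (the `LookupError` guard is our optional addition, not part of the commit):

import nltk
from nltk.tokenize import sent_tokenize

# Fetch the Punkt sentence-tokenizer models only if they are missing
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')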
@@ -29,106 +32,88 @@ API_KEY = os.environ["API_KEY"]
 def transcribe(audio, text):
     global messages
     global answer_count
-
+    messages = [initial_message]
+    messages_rev = [initial_message]
+
     transcript = {'text': ''}
     input_text = []
 
+    counter = 0
     # Transcribe the audio if provided
     if audio is not None:
         audio_file = open(audio, "rb")
         transcript = openai.Audio.transcribe("whisper-1", audio_file, language="en")
         messages.append({"role": "user", "content": transcript["text"]})
-
-    # Tokenize the text input
-    if text is not None:
-        # Split the input text into sentences
-        sentences = re.split("(?<=[.!?]) +", text)
 
-        # Initialize a list to store the tokens
-        input_tokens = []
+    # Split the input text into sentences
+    sentences = sent_tokenize(text)
 
-        # Add each sentence to the input_tokens list
-        for sentence in sentences:
-            # Tokenize the sentence using the GPT-2 tokenizer
-            sentence_tokens = tokenizer.encode(sentence)
-            # Check if adding the sentence would exceed the token limit
-            if len(input_tokens) + len(sentence_tokens) < 1440:
-                # Add the sentence tokens to the input_tokens list
-                input_tokens.extend(sentence_tokens)
-            else:
-                # If adding the sentence would exceed the token limit, truncate it
-                sentence_tokens = sentence_tokens[:1440-len(input_tokens)]
-                input_tokens.extend(sentence_tokens)
-                break
-        # Decode the input tokens into text
-        input_text = tokenizer.decode(input_tokens)
-        # Add the input text to the messages list
-        # messages.append({"role": "user", "content": input_text})
-
-    # Add the input text to the messages list
-    messages.append({"role": "user", "content": transcript["text"]+input_text})
-
-    # Check if the accumulated tokens have exceeded 2096
-    num_tokens = sum(len(tokenizer.encode(message["content"])) for message in messages)
-    if num_tokens > 2096:
-        # Concatenate the chat history
-        chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages if message['role'] != 'system'])
-
-        # Append the number of tokens used to the end of the chat transcript
-        chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
-
-        # Get the current time in Eastern Time (ET)
-        now_et = datetime.now(timezone(timedelta(hours=-5)))
-        # Format the time as string (YY-MM-DD HH:MM)
-        published_date = now_et.strftime('%m-%d-%y %H:%M')
-
-        # Upload the chat transcript to Notion
-        df = pd.DataFrame([chat_transcript])
-        notion_df.upload(df, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date+'FULL'), api_key=API_KEY)
-
-        # Reset the messages list and answer counter
-        messages = [initial_message]
-        answer_count = 0
-    else:
-        # Increment the answer counter
-        answer_count += 1
-
-    # Generate the system message using the OpenAI API
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        prompt = [{"text": f"{message['role']}: {message['content']}\n\n"} for message in messages]
+    # Split the input text into sub-input tokens based on the condition
+    subinput_tokens = []
+    buffer = []
+    for sentence in sentences:
+        sentence_tokens = tokenizer.encode(sentence)
+        if len(buffer) + len(sentence_tokens) > 800:
+            subinput_tokens.append(buffer)
+            buffer = []
+        buffer.extend(sentence_tokens)
+    if buffer:
+        subinput_tokens.append(buffer)
+
+    chat_transcript = ''
+
+    for tokens in subinput_tokens:
+        # Decode the tokens into text
+        subinput_text = tokenizer.decode(tokens)
+        messages.append({"role": "user", "content": transcript["text"]+str(subinput_text)})
+
+        num_tokens = sum(len(tokenizer.encode(message["content"])) for message in messages)
+        if num_tokens > 2096:
+            # Concatenate the chat history
+            chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages if message['role'] != 'user'])
+            # Append the number of tokens used to the end of the chat transcript
+            chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
+
+            # Get the current time in Eastern Time (ET)
+            now_et = datetime.now(timezone(timedelta(hours=-5)))
+            # Format the time as string (YY-MM-DD HH:MM)
+            published_date = now_et.strftime('%m-%d-%y %H:%M')
+            if counter > 0:
+                # Upload the chat transcript to Notion
+                df = pd.DataFrame([chat_transcript])
+                notion_df.upload(df, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date+'FULL'), api_key=API_KEY)
+            counter += 1
+            messages = [{"role": "system", "content": initial_message}]
+            messages = [{"role": "user", "content": subinput_text}]
+            answer_count = 0
+
+        # Generate the system message using the OpenAI API
+        # with concurrent.futures.ThreadPoolExecutor() as executor:
         system_message = openai.ChatCompletion.create(
             model="gpt-3.5-turbo",
             messages=messages,
             max_tokens=2000
         )["choices"][0]["message"]
-    # Wait for the completion of the OpenAI API call
-
-    # Add the system message to the messages list
-    # messages.append(system_message)
-
-    # Add the system message to the beginning of the messages list
-    messages_rev.insert(0, system_message)
-    # Add the input text to the messages list
-    messages_rev.insert(0, {"role": "user", "content": input_text + transcript["text"]})
-
-    # Concatenate the chat history
-    chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages_rev if message['role'] != 'system'])
-
-    # Append the number of tokens used to the end of the chat transcript
-    chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
+
+        messages.append({"role": "system", "content": str(system_message['content'])})
+        messages_rev.append({"role": "system", "content": str(system_message['content'])})
 
-    # Save the chat transcript to a file
-    with open("conversation_history.txt", "a") as f:
-        f.write(chat_transcript)
+    # Concatenate the chat history
+    chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages_rev if message['role'] != 'user'])
+    # if not isinstance(messages[-1]['content'], str):
+    #     continue
 
-    # Upload the chat transcript to Notion
-    now_et = datetime.now(timezone(timedelta(hours=-5)))
-    published_date = now_et.strftime('%m-%d-%y %H:%M')
-    df = pd.DataFrame([chat_transcript])
-    notion_df.upload(df, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date), api_key=API_KEY)
+    # Append the number of tokens used to the end of the chat transcript
+    chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
+    df = pd.DataFrame([chat_transcript])
+    # Get the current time in Eastern Time (ET)
+    now_et = datetime.now(timezone(timedelta(hours=-5)))
+    # Format the time as string (YY-MM-DD HH:MM)
+    published_date = now_et.strftime('%m-%d-%y %H:%M')
+    notion_df.upload(df, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date), api_key=API_KEY)
 
     # Return the chat transcript
-    return system_message['content']
+    return chat_transcript
 
 # Define the input and output components for Gradio
 audio_input = Audio(source="microphone", type="filepath", label="Record your message")
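The heart of this commit replaces regex sentence splitting plus hard truncation at 1440 tokens with NLTK sentence tokenization and buffering: sentences accumulate into buffers of at most roughly 800 GPT-2 tokens, and each buffer is decoded back to text and sent as its own user message. A self-contained sketch of that scheme, assuming the Punkt models are installed as in the first hunk (the `chunk_text` name and the `buffer and` guard, which avoids emitting an empty first chunk, are ours; a single sentence longer than the budget still becomes one oversized chunk, as in the diff):

from nltk.tokenize import sent_tokenize
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

def chunk_text(text, budget=800):
    # Group whole sentences into token buffers of at most `budget` tokens
    chunks, buffer = [], []
    for sentence in sent_tokenize(text):
        sentence_tokens = tokenizer.encode(sentence)
        if buffer and len(buffer) + len(sentence_tokens) > budget:
            chunks.append(buffer)
            buffer = []
        buffer.extend(sentence_tokens)
    if buffer:
        chunks.append(buffer)
    # Decode each buffer back to text for use as a chat message
    return [tokenizer.decode(chunk) for chunk in chunks]

Each string returned by `chunk_text` then plays the role of `subinput_text` in the loop above.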
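One apparent bug in the reset branch: `messages` is assigned twice in a row, so the list holding the system message is discarded immediately. Since `initial_message` is used elsewhere as a complete message dict (`messages = [initial_message]`), the intended reset was presumably a single list keeping both entries; a guess at the intent, not what was committed:

# Assumed intent: keep the initial system message and seed the next
# window with the current sub-input in one assignment
messages = [initial_message, {"role": "user", "content": subinput_text}]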
 
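Separately, both versions stamp Notion uploads with `timezone(timedelta(hours=-5))`, a fixed UTC-5 offset that is an hour off for US Eastern during daylight saving time. On Python 3.9+, `zoneinfo` gives a DST-aware alternative; a sketch, not part of the commit:

from datetime import datetime
from zoneinfo import ZoneInfo

# DST-aware US Eastern time instead of a fixed UTC-5 offset
now_et = datetime.now(ZoneInfo("America/New_York"))
published_date = now_et.strftime('%m-%d-%y %H:%M')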