son9john committed
Commit e608162 · 1 Parent(s): c447195

Update app.py

Files changed (1)
  1. app.py +66 -81
app.py CHANGED
@@ -10,6 +10,9 @@ import pandas as pd
 from datetime import datetime, timezone, timedelta
 import notion_df
 import concurrent.futures
+from nltk.tokenize import sent_tokenize
+nltk.download('punkt')
+
 
 # Define the tokenizer and model
 tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
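A note on the new imports: the hunk calls `nltk.download('punkt')` but only imports `sent_tokenize`, and `nltk` itself is never imported, so that line raises a `NameError` as written. A minimal preamble that would run (the `LookupError` guard is our optional addition, not part of the commit):

import nltk
from nltk.tokenize import sent_tokenize

# Fetch the Punkt sentence-tokenizer models only if they are missing
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')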
@@ -29,106 +32,88 @@ API_KEY = os.environ["API_KEY"]
 def transcribe(audio, text):
     global messages
     global answer_count
-
+    messages = [initial_message]
+    messages_rev = [initial_message]
+
     transcript = {'text': ''}
     input_text = []
 
+    counter = 0
     # Transcribe the audio if provided
     if audio is not None:
         audio_file = open(audio, "rb")
         transcript = openai.Audio.transcribe("whisper-1", audio_file, language="en")
         messages.append({"role": "user", "content": transcript["text"]})
-
-    # Tokenize the text input
-    if text is not None:
-        # Split the input text into sentences
-        sentences = re.split("(?<=[.!?]) +", text)
 
-        # Initialize a list to store the tokens
-        input_tokens = []
+    # Split the input text into sentences
+    sentences = sent_tokenize(text)
 
-        # Add each sentence to the input_tokens list
-        for sentence in sentences:
-            # Tokenize the sentence using the GPT-2 tokenizer
-            sentence_tokens = tokenizer.encode(sentence)
-            # Check if adding the sentence would exceed the token limit
-            if len(input_tokens) + len(sentence_tokens) < 1440:
-                # Add the sentence tokens to the input_tokens list
-                input_tokens.extend(sentence_tokens)
-            else:
-                # If adding the sentence would exceed the token limit, truncate it
-                sentence_tokens = sentence_tokens[:1440-len(input_tokens)]
-                input_tokens.extend(sentence_tokens)
-                break
-        # Decode the input tokens into text
-        input_text = tokenizer.decode(input_tokens)
-        # Add the input text to the messages list
-        # messages.append({"role": "user", "content": input_text})
-
-    # Add the input text to the messages list
-    messages.append({"role": "user", "content": transcript["text"]+input_text})
-
-    # Check if the accumulated tokens have exceeded 2096
-    num_tokens = sum(len(tokenizer.encode(message["content"])) for message in messages)
-    if num_tokens > 2096:
-        # Concatenate the chat history
-        chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages if message['role'] != 'system'])
-
-        # Append the number of tokens used to the end of the chat transcript
-        chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
-
-        # Get the current time in Eastern Time (ET)
-        now_et = datetime.now(timezone(timedelta(hours=-5)))
-        # Format the time as string (YY-MM-DD HH:MM)
-        published_date = now_et.strftime('%m-%d-%y %H:%M')
-
-        # Upload the chat transcript to Notion
-        df = pd.DataFrame([chat_transcript])
-        notion_df.upload(df, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date+'FULL'), api_key=API_KEY)
-
-        # Reset the messages list and answer counter
-        messages = [initial_message]
-        answer_count = 0
-    else:
-        # Increment the answer counter
-        answer_count += 1
-
-    # Generate the system message using the OpenAI API
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        prompt = [{"text": f"{message['role']}: {message['content']}\n\n"} for message in messages]
+    # Split the input text into sub-input tokens based on the condition
+    subinput_tokens = []
+    buffer = []
+    for sentence in sentences:
+        sentence_tokens = tokenizer.encode(sentence)
+        if len(buffer) + len(sentence_tokens) > 800:
+            subinput_tokens.append(buffer)
+            buffer = []
+        buffer.extend(sentence_tokens)
+    if buffer:
+        subinput_tokens.append(buffer)
+
+    chat_transcript = ''
+
+    for tokens in subinput_tokens:
+        # Decode the tokens into text
+        subinput_text = tokenizer.decode(tokens)
+        messages.append({"role": "user", "content": transcript["text"]+str(subinput_text)})
+
+        num_tokens = sum(len(tokenizer.encode(message["content"])) for message in messages)
+        if num_tokens > 2096:
+            # Concatenate the chat history
+            chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages if message['role'] != 'user'])
+            # Append the number of tokens used to the end of the chat transcript
+            chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
+
+            # Get the current time in Eastern Time (ET)
+            now_et = datetime.now(timezone(timedelta(hours=-5)))
+            # Format the time as string (YY-MM-DD HH:MM)
+            published_date = now_et.strftime('%m-%d-%y %H:%M')
+            if counter > 0:
+                # Upload the chat transcript to Notion
+                df = pd.DataFrame([chat_transcript])
+                notion_df.upload(df, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date+'FULL'), api_key=API_KEY)
+            counter += 1
+            messages = [{"role": "system", "content": initial_message}]
+            messages = [{"role": "user", "content": subinput_text}]
+            answer_count = 0
+
+        # Generate the system message using the OpenAI API
+        # with concurrent.futures.ThreadPoolExecutor() as executor:
         system_message = openai.ChatCompletion.create(
             model="gpt-3.5-turbo",
             messages=messages,
             max_tokens=2000
         )["choices"][0]["message"]
-    # Wait for the completion of the OpenAI API call
-
-    # Add the system message to the messages list
-    # messages.append(system_message)
-
-    # Add the system message to the beginning of the messages list
-    messages_rev.insert(0, system_message)
-    # Add the input text to the messages list
-    messages_rev.insert(0, {"role": "user", "content": input_text + transcript["text"]})
-
-    # Concatenate the chat history
-    chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages_rev if message['role'] != 'system'])
-
-    # Append the number of tokens used to the end of the chat transcript
-    chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
+
+        messages.append({"role": "system", "content": str(system_message['content'])})
+        messages_rev.append({"role": "system", "content": str(system_message['content'])})
 
-    # Save the chat transcript to a file
-    with open("conversation_history.txt", "a") as f:
-        f.write(chat_transcript)
+    # Concatenate the chat history
+    chat_transcript = "\n\n".join([f"[ANSWER {answer_count}]{message['role']}: {message['content']}" for message in messages_rev if message['role'] != 'user'])
+    # if not isinstance(messages[-1]['content'], str):
+    #     continue
 
-    # Upload the chat transcript to Notion
-    now_et = datetime.now(timezone(timedelta(hours=-5)))
-    published_date = now_et.strftime('%m-%d-%y %H:%M')
-    df = pd.DataFrame([chat_transcript])
-    notion_df.upload(df, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date), api_key=API_KEY)
+    # Append the number of tokens used to the end of the chat transcript
+    chat_transcript += f"\n\nNumber of tokens used: {num_tokens}\n\n"
+    df = pd.DataFrame([chat_transcript])
+    # Get the current time in Eastern Time (ET)
+    now_et = datetime.now(timezone(timedelta(hours=-5)))
+    # Format the time as string (YY-MM-DD HH:MM)
+    published_date = now_et.strftime('%m-%d-%y %H:%M')
+    notion_df.upload(df, 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4', title=str(published_date), api_key=API_KEY)
 
     # Return the chat transcript
-    return system_message['content']
+    return chat_transcript
 
 # Define the input and output components for Gradio
 audio_input = Audio(source="microphone", type="filepath", label="Record your message")
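The heart of this commit replaces regex sentence splitting plus hard truncation at 1440 tokens with NLTK sentence tokenization and buffering: sentences accumulate into buffers of at most roughly 800 GPT-2 tokens, and each buffer is decoded back to text and sent as its own user message. A self-contained sketch of that scheme, assuming the Punkt models are installed as in the first hunk (the `chunk_text` name and the `buffer and` guard, which avoids emitting an empty first chunk, are ours; a single sentence longer than the budget still becomes one oversized chunk, as in the diff):

from nltk.tokenize import sent_tokenize
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

def chunk_text(text, budget=800):
    # Group whole sentences into token buffers of at most `budget` tokens
    chunks, buffer = [], []
    for sentence in sent_tokenize(text):
        sentence_tokens = tokenizer.encode(sentence)
        if buffer and len(buffer) + len(sentence_tokens) > budget:
            chunks.append(buffer)
            buffer = []
        buffer.extend(sentence_tokens)
    if buffer:
        chunks.append(buffer)
    # Decode each buffer back to text for use as a chat message
    return [tokenizer.decode(chunk) for chunk in chunks]

Each string returned by `chunk_text` then plays the role of `subinput_text` in the loop above.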
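One apparent bug in the reset branch: `messages` is assigned twice in a row, so the list holding the system message is discarded immediately. Since `initial_message` is used elsewhere as a complete message dict (`messages = [initial_message]`), the intended reset was presumably a single list keeping both entries; a guess at the intent, not what was committed:

# Assumed intent: keep the initial system message and seed the next
# window with the current sub-input in one assignment
messages = [initial_message, {"role": "user", "content": subinput_text}]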
 
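Separately, both versions stamp Notion uploads with `timezone(timedelta(hours=-5))`, a fixed UTC-5 offset that is an hour off for US Eastern during daylight saving time. On Python 3.9+, `zoneinfo` gives a DST-aware alternative; a sketch, not part of the commit:

from datetime import datetime
from zoneinfo import ZoneInfo

# DST-aware US Eastern time instead of a fixed UTC-5 offset
now_et = datetime.now(ZoneInfo("America/New_York"))
published_date = now_et.strftime('%m-%d-%y %H:%M')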