Keane Moraes committed on
Commit
f76e4eb
·
1 Parent(s): 625fc77

threading changes

Browse files
Files changed (1) hide show
  1. app.py +59 -49
app.py CHANGED
@@ -14,6 +14,7 @@ import whisper
14
  import os, json
15
  import math
16
  import re
 
17
 
18
  # Custom classes
19
  from transcription import *
@@ -41,10 +42,12 @@ data_transcription = {"title":"", "text":""}
41
  embeddings = []
42
  text_chunks_lib = dict()
43
  user_input = None
 
44
 
45
  tldr = ""
46
  summary = ""
47
  takeaways = []
 
48
 
49
  folder_name = "./tests"
50
  input_accepted = False
@@ -61,6 +64,47 @@ st.write('It provides a summary, transcription, key insights, a mind map and a Q
61
 
62
  bar = st.progress(0)
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  # =========== SIDEBAR FOR GENERATION ===========
65
  with st.sidebar:
66
  youtube_link = st.text_input(label = "Type in your Youtube link", placeholder = "", key="url")
@@ -81,7 +125,7 @@ with st.sidebar:
81
 
82
  if st.button("Start Analysis"):
83
 
84
- # Check if it is a valid youtube URL
85
  if re.search(REGEXP_YOUTUBE_URL, youtube_link):
86
  vte = VideoTranscription(youtube_link)
87
  YOUTUBE_VIDEO_ID = youtube_link.split("=")[1]
@@ -89,13 +133,10 @@ with st.sidebar:
89
  if not os.path.exists(folder_name):
90
  os.mkdir(folder_name)
91
 
92
- with st.spinner('Running process...'):
93
  data_transcription = vte.transcribe()
94
  segments = data_transcription['segments']
95
-
96
- with open(f"{folder_name}/data.json", "w") as f:
97
- json.dump(data_transcription, f, indent=4)
98
-
99
  # PDF Transcription
100
  elif pdf_file is not None:
101
  pte = PDFTranscription(pdf_file)
@@ -103,7 +144,7 @@ with st.sidebar:
103
  if not os.path.exists(folder_name):
104
  os.mkdir(folder_name)
105
 
106
- with st.spinner('Running process...'):
107
  data_transcription = pte.transcribe()
108
  segments = data_transcription['segments']
109
 
@@ -114,7 +155,7 @@ with st.sidebar:
114
  if not os.path.exists(f""):
115
  os.mkdir(folder_name)
116
 
117
- with st.spinner('Running process...'):
118
  data_transcription = ate.transcribe()
119
  segments = data_transcription['segments']
120
 
@@ -124,49 +165,18 @@ with st.sidebar:
124
  else:
125
  st.error("Please type in your youtube link or upload the PDF")
126
  st.experimental_rerun()
127
-
128
- # Generate embeddings
129
- if not os.path.exists(f"{folder_name}/word_embeddings.csv"):
130
- for i, segment in enumerate(segments):
131
- bar.progress(max(math.ceil((i/len(segments) * 50)), 1))
132
- response = openai.Embedding.create(
133
- input= segment["text"].strip(),
134
- model="text-embedding-ada-002"
135
- )
136
- embeddings = response['data'][0]['embedding']
137
- meta = {
138
- "text": segment["text"].strip(),
139
- "embedding": embeddings
140
- }
141
- data.append(meta)
142
-
143
- pd.DataFrame(data).to_csv(f'{folder_name}/word_embeddings.csv')
144
- else:
145
- data = pd.read_csv(f'{folder_name}/word_embeddings.csv')
146
- embeddings = data["embedding"]
147
 
148
- bar.progress(75)
149
-
150
- text_df = pd.DataFrame.from_dict({"title": [data_transcription["title"]], "text":[data_transcription["text"]]})
151
- input_accepted = True
152
 
153
- with st.spinner('Breaking up the text and doing analysis...'):
154
- # For each body of text, create text chunks of a certain token size required for the transformer
155
- title_entry = text_df['title'][0]
156
- print(title_entry)
157
- for i in range(0, len(text_df)):
158
- nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)
159
- # For each chunk of sentences (within the token max)
160
- text_chunks = []
161
- for n in range(0, len(nested_sentences)):
162
- tc = " ".join(map(str, nested_sentences[n]))
163
- text_chunks.append(tc)
164
-
165
- text_chunks_lib[title_entry] = text_chunks
166
-
167
- # Generate key takeaways
168
- key_engine = Keywords(title_entry)
169
- keywords = key_engine.get_keywords(text_chunks_lib)
170
 
171
  # Generate the summary
172
  if gen_summary == 'Yes':
 
14
  import os, json
15
  import math
16
  import re
17
+ from threading import Thread
18
 
19
  # Custom classes
20
  from transcription import *
 
42
  embeddings = []
43
  text_chunks_lib = dict()
44
  user_input = None
45
+ title_entry = None
46
 
47
  tldr = ""
48
  summary = ""
49
  takeaways = []
50
+ keywords = []
51
 
52
  folder_name = "./tests"
53
  input_accepted = False
 
64
 
65
  bar = st.progress(0)
66
 
67
def generate_word_embeddings():
    """Compute (or load cached) per-segment embeddings and publish them module-wide.

    Reads the module-level ``segments`` and ``folder_name``; fills the
    module-level ``data`` and ``embeddings``. The ``global`` declaration is
    required: without it every assignment below creates a throwaway local and
    the rest of the app never sees the results. Intended to run as a
    ``threading.Thread`` target.

    NOTE(review): Streamlit calls (``bar.progress``) from a worker thread may
    need ``add_script_run_ctx`` — confirm against the Streamlit version used.
    """
    global data, embeddings
    if not os.path.exists(f"{folder_name}/word_embeddings.csv"):
        for i, segment in enumerate(segments):
            # First half of the progress bar (1-50%) tracks embedding generation.
            bar.progress(max(math.ceil((i / len(segments) * 50)), 1))
            response = openai.Embedding.create(
                input=segment["text"].strip(),
                model="text-embedding-ada-002"
            )
            embeddings = response['data'][0]['embedding']
            meta = {
                "text": segment["text"].strip(),
                "embedding": embeddings
            }
            data.append(meta)

        # Cache the embeddings so subsequent runs skip the API calls.
        pd.DataFrame(data).to_csv(f'{folder_name}/word_embeddings.csv')
    else:
        data = pd.read_csv(f'{folder_name}/word_embeddings.csv')
        # Restore the embeddings column from the cache (this line was dropped
        # in the threading refactor, leaving ``embeddings`` empty on cache hits).
        embeddings = data["embedding"]
85
+
86
+
87
def generate_text_chunks_lib():
    """Split the transcription text into token-bounded chunks and extract keywords.

    Reads the module-level ``data_transcription``; publishes ``title_entry``,
    ``keywords``, ``input_accepted`` and fills ``text_chunks_lib``. The
    ``global`` declaration is required: the diff adds module-level placeholders
    (``title_entry = None``, ``keywords = []``) precisely so this thread target
    can fill them, but without ``global`` the assignments below would be
    discarded as function locals. ``text_chunks_lib`` is a dict mutated in
    place, so it needs no declaration. Intended to run as a
    ``threading.Thread`` target.
    """
    global title_entry, keywords, input_accepted
    text_df = pd.DataFrame.from_dict({"title": [data_transcription["title"]],
                                      "text": [data_transcription["text"]]})
    input_accepted = True

    # For each body of text, create text chunks of a certain token size
    # required for the transformer.
    title_entry = text_df['title'][0]
    print(title_entry)
    for i in range(0, len(text_df)):
        nested_sentences = md.create_nest_sentences(document=text_df['text'][i],
                                                    token_max_length=1024)
        # For each chunk of sentences (within the token max)
        text_chunks = []
        for n in range(0, len(nested_sentences)):
            tc = " ".join(map(str, nested_sentences[n]))
            text_chunks.append(tc)

        text_chunks_lib[title_entry] = text_chunks

    # Generate key takeaways
    key_engine = Keywords(title_entry)
    keywords = key_engine.get_keywords(text_chunks_lib)
107
+
108
  # =========== SIDEBAR FOR GENERATION ===========
109
  with st.sidebar:
110
  youtube_link = st.text_input(label = "Type in your Youtube link", placeholder = "", key="url")
 
125
 
126
  if st.button("Start Analysis"):
127
 
128
+ # Youtube Transcription
129
  if re.search(REGEXP_YOUTUBE_URL, youtube_link):
130
  vte = VideoTranscription(youtube_link)
131
  YOUTUBE_VIDEO_ID = youtube_link.split("=")[1]
 
133
  if not os.path.exists(folder_name):
134
  os.mkdir(folder_name)
135
 
136
+ with st.spinner('Running transcription...'):
137
  data_transcription = vte.transcribe()
138
  segments = data_transcription['segments']
139
+
 
 
 
140
  # PDF Transcription
141
  elif pdf_file is not None:
142
  pte = PDFTranscription(pdf_file)
 
144
  if not os.path.exists(folder_name):
145
  os.mkdir(folder_name)
146
 
147
+ with st.spinner('Running transcription...'):
148
  data_transcription = pte.transcribe()
149
  segments = data_transcription['segments']
150
 
 
155
  if not os.path.exists(f""):
156
  os.mkdir(folder_name)
157
 
158
+ with st.spinner('Running transcription...'):
159
  data_transcription = ate.transcribe()
160
  segments = data_transcription['segments']
161
 
 
165
  else:
166
  st.error("Please type in your youtube link or upload the PDF")
167
  st.experimental_rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
 
 
 
 
169
 
170
+ # Generate embeddings
171
+ thread1 = Thread(target=generate_word_embeddings)
172
+ thread1.start()
173
+ # Generate text chunks
174
+ thread2 = Thread(target=generate_text_chunks_lib)
175
+ thread2.start()
176
+
177
+ # Wait for them to complete
178
+ thread1.join()
179
+ thread2.join()
 
 
 
 
 
 
 
180
 
181
  # Generate the summary
182
  if gen_summary == 'Yes':