Spaces:
Sleeping
Sleeping
lordvader31
committed on
Commit
·
79b94f8
1
Parent(s):
499c2f5
major update from bitbucket
Browse files
- app.py +366 -0
- classifier.py +0 -0
- keywords.py +28 -0
- mindmap.py +50 -0
- models.py +93 -0
- prompts/mindmap.prompt +11 -0
- summary.py +55 -0
- takeaways.py +51 -0
- transcription.py +302 -0
app.py
ADDED
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Streamlit classes
|
2 |
+
import streamlit as st
|
3 |
+
from streamlit_agraph import agraph, Node, Edge, Config
|
4 |
+
from streamlit_chat import message
|
5 |
+
|
6 |
+
# Data manipulation and embeddings
|
7 |
+
import pandas as pd
|
8 |
+
import numpy as np
|
9 |
+
import openai
|
10 |
+
from openai.embeddings_utils import distances_from_embeddings
|
11 |
+
import whisper
|
12 |
+
|
13 |
+
# Exec tasks
|
14 |
+
import os, json
|
15 |
+
import math
|
16 |
+
import re
|
17 |
+
|
18 |
+
# Custom classes
|
19 |
+
from transcription import *
|
20 |
+
from keywords import Keywords
|
21 |
+
from summary import TextSummarizer
|
22 |
+
from takeaways import KeyTakeaways
|
23 |
+
from mindmap import MindMap
|
24 |
+
import models as md
|
25 |
+
|
26 |
+
|
27 |
+
# Pattern for links on youtube.com (optionally www.) or youtu.be, with an
# optional http(s) scheme. Written as a raw string: the previous non-raw
# literal relied on invalid escape sequences ("\:" and "\/"), which newer
# Python versions warn about. ":" and "/" need no escaping in a regex, so the
# set of matched URLs is unchanged.
REGEXP_YOUTUBE_URL = r"^(https?://)?((www\.)?youtube\.com|youtu\.be)/.+$"
|
28 |
+
|
29 |
+
# Whisper model, loaded when the script is (re)executed.
model = whisper.load_model('base')

# --- Script-level state -----------------------------------------------------
# Every name below is re-initialised each time Streamlit re-runs the script and
# is then overwritten by the analysis pipeline further down.

# Latest Q&A answer.
output = ''
# Rows of {"text", "embedding"} collected while embedding the segments.
data = list()
# Transcription payload; the pipeline replaces it with the transcriber's result.
data_transcription = dict(title="", text="")
embeddings = list()
# Maps the document title to its list of text chunks.
text_chunks_lib = {}
user_input = None

# Generated notes.
tldr = ""
summary = ""
takeaways = list()

# Working folder for cached artefacts, and progress flags.
folder_name = "./tests"
input_accepted = False
is_completed_analysis = False
|
45 |
+
|
46 |
+
def get_initial_message():
    """Return the seed conversation used to initialise the chat history.

    The result is a new list on every call, made of three ``{"role",
    "content"}`` dicts (system, user, assistant) in chat-completion message
    format.

    The spelling and grammar mistakes in the original seed text ("anwers",
    "aboout", "Thats") are corrected here, since this text is prompt material.
    """
    return [
        {"role": "system", "content": "You are a helpful AI Tutor who answers brief questions about AI."},
        {"role": "user", "content": "I want to learn AI"},
        {"role": "assistant", "content": "That's awesome, what do you want to know about AI?"},
    ]
|
53 |
+
|
54 |
+
# Sample agraph data: two image nodes joined by one directed edge.
nodes = [
    Node(
        id="Spiderman",
        label="Peter Parker",
        size=25,
        shape="circularImage",
        image="http://marvel-force-chart.surge.sh/marvel_force_chart_img/top_spiderman.png",
    ),  # includes **kwargs
    Node(
        id="Captain_Marvel",
        size=25,
        shape="circularImage",
        image="http://marvel-force-chart.surge.sh/marvel_force_chart_img/top_captainmarvel.png",
    ),
]
edges = [
    Edge(source="Captain_Marvel", label="friend_of", target="Spiderman"),
]

# Rendering options for the agraph component.
config = Config(width=750, height=950, directed=True, physics=True, hierarchical=False)


# API key as found in the environment (None when the variable is unset).
user_secret = os.environ.get("OPENAI_API_KEY")

# Define the purpose of the application
st.header('Almithal')
st.subheader('Almithal is a comprehensive video and PDF study buddy.')
st.write('It provides a summary, transcription, key insights, a mind map and a Q&A feature where you can actually "talk" to the datasource.')

# Progress bar updated by the analysis pipeline.
bar = st.progress(0)
|
90 |
+
|
91 |
+
# =========== SIDEBAR FOR GENERATION ===========
|
92 |
+
with st.sidebar:
    # ---- Input widgets ------------------------------------------------------
    youtube_link = st.text_input(label = "Type in your Youtube link", placeholder = "", key="url")
    st.markdown("OR")
    pdf_file = st.file_uploader("Upload your PDF", type="pdf")
    st.markdown("OR")
    audio_file = st.file_uploader("Upload your MP3 audio file", type=["wav", "mp3"])

    gen_keywords = st.radio(
        "Generate keywords from text?",
        ('Yes', 'No')
    )

    gen_summary = st.radio(
        "Generate summary from text? (recommended for label matching below, but will take longer)",
        ('Yes', 'No')
    )

    if st.button("Start Analysis"):

        # Stays None when no usable source was supplied; the rest of the
        # pipeline is skipped in that case so the error below remains visible
        # (the previous immediate rerun discarded it).
        segments = None

        # Check if it is a valid youtube URL
        if re.search(REGEXP_YOUTUBE_URL, youtube_link):
            vte = VideoTranscription(youtube_link)
            # NOTE(review): the URL pattern also accepts youtu.be short links,
            # which contain no "="; this split raises IndexError for them.
            YOUTUBE_VIDEO_ID = youtube_link.split("=")[1]
            folder_name = f"./tests/{YOUTUBE_VIDEO_ID}"
            # makedirs also creates a missing parent folder and tolerates an
            # existing target.
            os.makedirs(folder_name, exist_ok=True)

            with st.spinner('Running process...'):
                data_transcription = vte.transcribe()
                segments = data_transcription['segments']

            with open(f"{folder_name}/data.json", "w") as f:
                json.dump(data_transcription, f, indent=4)

        # PDF Transcription
        elif pdf_file is not None:
            pte = PDFTranscription(pdf_file)
            folder_name = pte.get_redacted_name()
            os.makedirs(folder_name, exist_ok=True)

            with st.spinner('Running process...'):
                data_transcription = pte.transcribe()
                segments = data_transcription['segments']

        # Audio transcription
        elif audio_file is not None:
            ate = AudioTranscription(audio_file)
            folder_name = ate.get_redacted_name()
            # The previous check tested os.path.exists("") (always False), so
            # mkdir raised FileExistsError whenever the folder already existed.
            os.makedirs(folder_name, exist_ok=True)

            with st.spinner('Running process...'):
                data_transcription = ate.transcribe()
                segments = data_transcription['segments']

            with open(f"{folder_name}/data.json", "w") as f:
                json.dump(data_transcription, f, indent=4)

        else:
            st.error("Please type in your youtube link or upload the PDF")

        if segments is not None:
            # Save the transcript information
            with open(f"{folder_name}/data_transcription.json", "w") as f:
                json.dump(data_transcription, f, indent=4)

            # Generate embeddings (one API call per segment), cached on disk
            # as word_embeddings.csv inside the working folder.
            if not os.path.exists(f"{folder_name}/word_embeddings.csv"):
                for i, segment in enumerate(segments):
                    # First half of the progress bar tracks embedding progress.
                    bar.progress(max(math.ceil((i/len(segments) * 50)), 1))
                    response = openai.Embedding.create(
                        input= segment["text"].strip(),
                        model="text-embedding-ada-002"
                    )
                    embeddings = response['data'][0]['embedding']
                    meta = {
                        "text": segment["text"].strip(),
                        "embedding": embeddings
                    }
                    data.append(meta)

                pd.DataFrame(data).to_csv(f'{folder_name}/word_embeddings.csv')
            else:
                data = pd.read_csv(f'{folder_name}/word_embeddings.csv')
                embeddings = data["embedding"]

            bar.progress(75)

            text_df = pd.DataFrame.from_dict({"title": [data_transcription["title"]], "text":[data_transcription["text"]]})
            input_accepted = True

            with st.spinner('Breaking up the text and doing analysis...'):
                # For each body of text, create text chunks of a certain token size required for the transformer
                title_entry = text_df['title'][0]
                print(title_entry)
                for i in range(0, len(text_df)):
                    nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)
                    # Join each group of sentences (within the token max) into one chunk
                    text_chunks = [" ".join(map(str, sentences)) for sentences in nested_sentences]
                    text_chunks_lib[title_entry] = text_chunks

                # Generate keywords only when requested. `keywords` is always
                # defined so the Keywords tab can iterate it unconditionally.
                if gen_keywords == 'Yes':
                    key_engine = Keywords(title_entry)
                    keywords = key_engine.get_keywords(text_chunks_lib)
                else:
                    keywords = []

            # Generate the summary
            if gen_summary == 'Yes':
                se = TextSummarizer(title_entry)
                with st.spinner("Generating summary and TLDR..."):
                    summary = se.generate_full_summary(text_chunks_lib)
                    summary_list = summary.split("\n\n")
                    tldr = se.generate_short_summary(summary_list)

            # Generate key takeaways
            kt = KeyTakeaways()
            with st.spinner("Generating key takeaways ... "):
                takeaways = kt.generate_key_takeaways(text_chunks_lib)

            is_completed_analysis = True
            bar.progress(100)
|
218 |
+
|
219 |
+
# Headline takeaways are listed above the tabs once an analysis has completed.
if is_completed_analysis:
    st.header("Key Takeaways")
    st.write("Here are some of the key takeaways from the data:")
    for takeaway in takeaways:
        st.markdown(f"- {takeaway}")


tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(
    ["Introduction", "Summary", "Transcription", "Mind Map", "Keywords", "Q&A"]
)

# =========== INTRODUCTION ===========
with tab1:
    st.subheader("Introduction")
    # Static usage instructions, rendered one markdown element per entry.
    for intro_line in (
        "## How do I use this?",
        "Do one of the following",
        '* Type in your youtube URL that you want worked on',
        '* Place the PDF file that you want worked on',
        "**Once the file / url has finished saving, a 'Start Analysis' button will appear. Click on this button to begin the note generation**",
    ):
        st.markdown(intro_line)
    st.warning("NOTE: This is just a demo product in alpha testing. Any and all bugs will soon be fixed")
    st.warning("After the note taking is done, you will see multiple tabs for more information")

# =========== SUMMARIZATION ===========
with tab2:
    if not is_completed_analysis:
        st.warning("Please wait for the analysis to finish")
    else:
        st.header("TL;DR")
        for point in tldr:
            st.markdown(f"- {point}")
        st.header("Summary")
        st.write(summary)

# =========== TRANSCRIPTION ===========
with tab3:
    st.header("Transcription")
    if not is_completed_analysis:
        st.warning("Please wait for the analysis to finish")
    else:
        with st.spinner("Generating transcript ..."):
            st.write("")
            # One element per text chunk of the analysed document.
            for text in text_chunks_lib[title_entry]:
                st.write(text)

# =========== MIND MAP ===========
with tab4:
    st.header("Mind Map")
    if not is_completed_analysis:
        st.warning("Please wait for the analysis to finish")
    else:
        mindmap = MindMap()
        with st.spinner("Generating mind map..."):
            mindmap.generate_graph(text_chunks_lib)
|
270 |
+
|
271 |
+
# =========== KEYWORDS ===========
|
272 |
+
with tab5:
    st.header("Keywords:")
    if not is_completed_analysis:
        st.warning("Please wait for the analysis to finish")
    elif gen_keywords != 'Yes':
        # gen_keywords holds the radio label ('Yes' or 'No'). Both are truthy
        # strings, so the previous truthiness test could never turn this off.
        st.info("Keyword generation was turned off for this analysis.")
    else:
        # Numbered list of keywords in the order they were returned.
        for i, keyword in enumerate(keywords):
            st.markdown(f"{i+1}. {keyword}")
|
279 |
+
|
280 |
+
# =========== QUERY BOT ===========
|
281 |
+
with tab6:
    # Chat history is kept in session state so it survives Streamlit reruns.
    if 'generated' not in st.session_state:
        st.session_state['generated'] = []

    if 'past' not in st.session_state:
        st.session_state['past'] = []

    def get_text():
        """Render the question box and return the text currently in it."""
        st.header("Ask me something about the video:")
        input_text = st.text_input("You: ", key="prompt")
        return input_text


    def get_embedding_text(prompt):
        """Return the stored text segments closest to ``prompt``.

        Embeds the question, loads ``word_embeddings.csv`` from the current
        ``folder_name``, ranks the rows by cosine distance to the question and
        returns the ``text`` of the four nearest rows joined by
        ``"\\n\\n###\\n\\n"``.
        """
        # Local import keeps this block self-contained. literal_eval parses
        # the stringified embedding lists without executing arbitrary code,
        # unlike the eval() it replaces.
        import ast

        response = openai.Embedding.create(
            input= prompt.strip(),
            model="text-embedding-ada-002"
        )
        q_embedding = response['data'][0]['embedding']
        df = pd.read_csv(f'{folder_name}/word_embeddings.csv', index_col=0)
        df['embedding'] = df['embedding'].apply(ast.literal_eval).apply(np.array)

        df['distances'] = distances_from_embeddings(q_embedding, df['embedding'].values, distance_metric='cosine')

        # Keep the four closest segments as context.
        closest = df.sort_values('distances', ascending=True).head(4)

        # Return the context
        return "\n\n###\n\n".join(closest["text"].tolist())

    def generate_response(prompt):
        """Send ``prompt`` to the completion model as a one-shot Q/A prompt and return the answer text."""
        one_shot_prompt = (
            "\n"
            "I am YoutubeGPT, a highly intelligent question answering bot.\n"
            "If you ask me a question that is rooted in truth, I will give you the answer.\n"
            "Q: What is human life expectancy in the United States?\n"
            "A: Human life expectancy in the United States is 78 years.\n"
            f"Q: {prompt}\n"
            "A:\n"
        )
        completions = openai.Completion.create(
            engine = "text-davinci-003",
            prompt = one_shot_prompt,
            max_tokens = 1024,
            n = 1,
            stop=["Q:"],
            temperature=0.5,
        )
        # Named `answer` so it does not shadow the imported `message` helper.
        answer = completions.choices[0].text
        return answer

    # NOTE(review): is_completed_analysis is reset to False near the top of the
    # script and only set to True inside the "Start Analysis" button branch, so
    # a rerun triggered by submitting a question presumably hides this input
    # again. Confirm, and consider persisting analysis results in
    # st.session_state.
    if is_completed_analysis:
        user_input = get_text()
    else:
        user_input = None

    if 'messages' not in st.session_state:
        st.session_state['messages'] = get_initial_message()

    if user_input:
        text_embedding = get_embedding_text(user_input)
        with open(f'{folder_name}/data_transcription.json', "r") as f:
            title = json.load(f)['title']
        # The title is used as-is. The previous code called str.join on it,
        # which inserted the separator between every character of the title.
        user_input_embedding = (
            f'Using this context: "{title}. {text_embedding}", '
            f'answer the following question. \n{user_input}'
        )
        output = generate_response(user_input_embedding)
        st.session_state.past.append(user_input)
        st.session_state.generated.append(output)

    # Newest exchange first: answer, then the question that produced it.
    if st.session_state['generated']:
        for i in range(len(st.session_state['generated'])-1, -1, -1):
            message(st.session_state["generated"][i], key=str(i))
            message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
|
364 |
+
|
365 |
+
|
366 |
+
# st.header("What else")
|
classifier.py
ADDED
File without changes
|
keywords.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import models as md
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
class Keywords:
    """Extract ranked keywords from chunked text with the shared keyword model."""

    def __init__(self, title_element: str):
        """Store the document title and load the keyword model.

        Args:
            title_element: Title of the document being analysed. The previous
                code ignored this argument and stored an empty list instead.
        """
        self.title_element = title_element
        self.kw_model = md.load_keyword_model()

    def get_keywords(self, text_chunks_lib: dict) -> pd.Series:
        """Return keywords scoring above 0.25, best first.

        Args:
            text_chunks_lib: Mapping of title -> list of text chunks.

        Returns:
            The ``keyword`` column (a pandas Series) of the filtered, sorted
            keyword table.
        """
        kw_dict = {}

        for key, text_chunks in text_chunks_lib.items():
            keywords_list = []
            for text_chunk in text_chunks:
                keywords_list += md.keyword_gen(self.kw_model, text_chunk)
            # dict() collapses repeated keywords; the last score seen wins.
            kw_dict[key] = dict(keywords_list)

        # Reshape {title: {keyword: score}} into rows of (keyword, title, score)
        kw_df0 = pd.DataFrame.from_dict(kw_dict).reset_index()
        kw_df0.rename(columns={'index': 'keyword'}, inplace=True)
        kw_df = pd.melt(kw_df0, id_vars=['keyword'], var_name='title', value_name='score').dropna()

        kw_column_list = ['keyword', 'score']
        kw_df = kw_df[kw_df['score'] > 0.25][kw_column_list].sort_values(['score'], ascending=False).reset_index().drop(columns='index')

        return kw_df['keyword']
|
mindmap.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import openai
|
3 |
+
import json
|
4 |
+
import graphviz
|
5 |
+
import streamlit as st
|
6 |
+
|
7 |
+
class MindMap:
    """Build a relationship graph ("mind map") from text chunks and render it."""

    def __init__(self):
        """Configure the OpenAI client from the OPENAI_API_KEY environment variable."""
        openai.api_key = os.getenv("OPENAI_API_KEY")

    def get_connections(self, text_chunks_libs: dict) -> list:
        """Ask the completion model for the relationships found in each chunk.

        Args:
            text_chunks_libs: Mapping of title -> list of text chunks.

        Returns:
            The relationship entries from every chunk whose response could be
            parsed, concatenated in chunk order. The prompt asks for
            ``[entity, relationship, entity]`` triples.
        """
        chunks = [chunk for key in text_chunks_libs for chunk in text_chunks_libs[key]]
        if not chunks:
            # Nothing to analyse: skip the prompt file and the API entirely.
            return []

        with open("./prompts/mindmap.prompt") as state_prompt:
            template = state_prompt.read()

        final_connections = []
        for text_chunk in chunks:
            # Fill a fresh copy of the template for each chunk. The previous
            # code overwrote the template itself, so "$prompt" was gone after
            # the first chunk and every later request re-sent the first chunk.
            prompt = template.replace("$prompt", text_chunk)

            response = openai.Completion.create(
                engine="text-davinci-003",
                prompt = prompt,
                temperature=0.5,
                max_tokens=2048,
                top_p=1,
                frequency_penalty=0.0,
                presence_penalty=0.0,
            )

            relationships = response.choices[0].text
            final_string = '{"relations":' + relationships + '}'
            try:
                relations = json.loads(final_string)["relations"]
            except json.JSONDecodeError:
                # The completion text is not guaranteed to be valid JSON; one
                # bad chunk should not abort the whole mind map.
                continue
            if isinstance(relations, list):
                final_connections.extend(relations)
        return final_connections


    def generate_graph(self, text_chunks_libs: dict):
        """Render a directed graph with one edge per extracted relationship.

        Each entry's first element becomes the source node and its third the
        target node. Entries that are not sequences of at least three elements
        are ignored.
        """
        graph = graphviz.Digraph()
        all_connections = self.get_connections(text_chunks_libs)
        for connection in all_connections:
            if not isinstance(connection, (list, tuple)) or len(connection) < 3:
                continue
            # str() guards against non-string entities in the parsed JSON.
            from_node = str(connection[0])
            to_node = str(connection[2])
            graph.edge(from_node, to_node)
        st.graphviz_chart(graph)
|
models.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, BartTokenizer, BartForConditionalGeneration
|
2 |
+
|
3 |
+
import streamlit as st
|
4 |
+
from keybert import KeyBERT
|
5 |
+
import re
|
6 |
+
|
7 |
+
def create_nest_sentences(document:str, token_max_length = 1024, tokenizer = None):
    """Group the sentences of ``document`` into chunks bounded by a token budget.

    The text is split into sentences on sentence-ending punctuation followed by
    a capital letter. Sentences are packed greedily into a chunk while the
    chunk's running token count stays below ``token_max_length``.

    Args:
        document: Text to split. Newlines are treated as spaces.
        token_max_length: Token budget per chunk.
        tokenizer: Optional callable used to count tokens. It is called as
            ``tokenizer(text, truncation=False, padding=False)`` and the length
            of element ``[0]`` of its result is taken as the token count.
            Defaults to the 'facebook/bart-large-mnli' tokenizer.

    Returns:
        A list of chunks, each a non-empty list of sentence strings. An empty
        or whitespace-only document yields ``[]``. A single sentence that is
        over budget on its own is kept as a chunk by itself.
    """
    if not document or not document.strip():
        return []

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

    nested = []
    sent = []
    length = 0

    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0] # hugging face transformer tokenizer
        sentence_length = len(tokens_in_sentence)

        # Close the current chunk when this sentence would reach the budget.
        # Never emit an empty chunk: the previous code did so when the very
        # first sentence was already over budget.
        if sent and length + sentence_length >= token_max_length:
            nested.append(sent)
            sent = []
            length = 0

        sent.append(sentence)
        # The sentence that opens a new chunk counts towards that chunk. The
        # previous code reset the counter to 0 here, so later chunks could
        # exceed the budget.
        length += sentence_length

    if sent:
        nested.append(sent)
    return nested
|
27 |
+
|
28 |
+
# cache_resource is Streamlit's cache for shared, non-serialisable objects such
# as ML models; cache_data pickles and copies the return value on every call.
@st.cache_resource
def load_keyword_model():
    """Create the KeyBERT keyword-extraction model (cached by Streamlit)."""
    kw_model = KeyBERT()
    return kw_model
|
32 |
+
|
33 |
+
def keyword_gen(kw_model, sequence:str):
    """Extract up to ten single-word keywords from ``sequence``.

    Calls ``kw_model.extract_keywords`` with English stop-word removal and MMR
    diversification (diversity 0.5) and returns its result unchanged.
    """
    extraction_options = {
        "keyphrase_ngram_range": (1, 1),
        "stop_words": 'english',
        "use_mmr": True,
        "diversity": 0.5,
        "top_n": 10,
    }
    return kw_model.extract_keywords(sequence, **extraction_options)
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
# Reference: https://huggingface.co/facebook/bart-large-mnli
|
45 |
+
# cache_resource is Streamlit's cache for shared, non-serialisable objects such
# as ML models; cache_data pickles and copies the return value on every call.
@st.cache_resource
def load_summary_model():
    """Create the 'facebook/bart-large-cnn' summarization pipeline (cached by Streamlit)."""
    model_name = "facebook/bart-large-cnn"
    summarizer = pipeline(task='summarization', model=model_name)
    return summarizer
|
50 |
+
|
51 |
+
def load_summary_model_large():
    """Build a summarization pipeline from an explicitly loaded BART model and tokenizer.

    Unlike the cached loaders in this module, this function is not decorated,
    so every call loads the weights again.
    """
    # NOTE(review): this checkpoint name ends in "-mnli" while the cached
    # summary loader uses "-cnn"; confirm this is the intended checkpoint for
    # a summarization task.
    checkpoint = "facebook/bart-large-mnli"
    return pipeline(
        task='summarization',
        model=BartForConditionalGeneration.from_pretrained(checkpoint),
        tokenizer=BartTokenizer.from_pretrained(checkpoint),
        framework='pt',
    )
|
57 |
+
|
58 |
+
def summarizer_gen(summarizer, sequence:str, maximum_tokens:int, minimum_tokens:int):
    """Summarize ``sequence`` and return the summary text of the first result.

    Runs deterministic beam search (4 beams, no sampling) with the summary
    length bounded by ``minimum_tokens`` and ``maximum_tokens``. Returns
    ``None`` when the first result has no ``summary_text`` entry.
    """
    generation_options = dict(
        num_beams=4,
        length_penalty=2.0,
        max_length=maximum_tokens,
        min_length=minimum_tokens,
        do_sample=False,
        early_stopping=True,
        no_repeat_ngram_size=3,
    )
    first_result = summarizer(sequence, **generation_options)[0]
    return first_result.get('summary_text')
|
68 |
+
|
69 |
+
|
70 |
+
# # Reference: https://www.datatrigger.org/post/nlp_hugging_face/
|
71 |
+
# # Custom summarization pipeline (to handle long articles)
|
72 |
+
# def summarize(text, minimum_length_of_summary = 100):
|
73 |
+
# # Tokenize and truncate
|
74 |
+
# inputs = tokenizer([text], truncation=True, max_length=1024, return_tensors='pt').to('cuda')
|
75 |
+
# # Generate summary
|
76 |
+
# summary_ids = model_bart.generate(inputs['input_ids'], num_beams=4, min_length = minimum_length_of_summary, max_length=400, early_stopping=True)
|
77 |
+
# # Untokenize
|
78 |
+
# return([tokenizer_bart.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0])
|
79 |
+
|
80 |
+
|
81 |
+
# Reference: https://huggingface.co/spaces/team-zero-shot-nli/zero-shot-nli/blob/main/utils.py
|
82 |
+
# cache_resource is Streamlit's cache for shared, non-serialisable objects such
# as ML models; cache_data pickles and copies the return value on every call.
@st.cache_resource
def load_model():
    """Create the 'facebook/bart-large-mnli' zero-shot classification pipeline (cached by Streamlit)."""
    model_name = "facebook/bart-large-mnli"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    classifier = pipeline(task='zero-shot-classification', model=model, tokenizer=tokenizer, framework='pt')
    return classifier
|
89 |
+
|
90 |
+
def classifier_zero(classifier, sequence:str, labels:list, multi_class:bool):
    """Classify ``sequence`` against ``labels`` and return ``(labels, scores)``.

    ``multi_class`` is forwarded to the classifier as ``multi_label``. Both
    returned values are read from the classifier's result by key.
    """
    result = classifier(sequence, labels, multi_label=multi_class)
    ranked_labels = result['labels']
    ranked_scores = result['scores']
    return ranked_labels, ranked_scores
|
93 |
+
|
prompts/mindmap.prompt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Given a prompt, extrapolate as many relationships as possible from it and provide a list of updates.
|
2 |
+
|
3 |
+
If an update is a relationship, provide [ENTITY 1, RELATIONSHIP, ENTITY 2]. The relationship is directed, so the order matters.
|
4 |
+
|
5 |
+
Example:
|
6 |
+
prompt: Alice is Bob's roommate. Bob is Charlie's friend.
|
7 |
+
updates:
|
8 |
+
[["Alice", "roommate", "Bob"], ["Bob", "friend", "Charlie"]]
|
9 |
+
|
10 |
+
prompt: $prompt
|
11 |
+
updates:
|
summary.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import models as md
|
2 |
+
import nltk
|
3 |
+
|
4 |
+
import openai
|
5 |
+
import os
|
6 |
+
|
7 |
+
nltk.download("punkt")
|
8 |
+
|
9 |
+
class TextSummarizer:
    """Produce a long summary (local BART pipeline) and a short one (OpenAI chat model)."""

    def __init__(self, title):
        """Remember the document title, load the summarizer and configure OpenAI.

        Args:
            title: Key under which this document's chunks are stored in the
                ``text_chunks_lib`` mapping passed to ``generate_full_summary``.
        """
        self.title = title
        self.model = "gpt-3.5-turbo"
        self.summarizer = md.load_summary_model()
        openai.api_key = os.getenv("OPENAI_API_KEY")

    def generate_short_summary(self, summary_chunks: list) -> list:
        """Condense each summary chunk into at most two sentences.

        Args:
            summary_chunks: Iterable of summary strings (the caller passes the
                full summary split on blank lines).

        Returns:
            One condensed summary per non-blank chunk, in input order. Blank
            chunks are skipped instead of being sent to the API.
        """
        PROMPT = (
            "\n"
            "You are a helpful assistant that summarizes youtube videos.\n"
            "Someone has already summarized the video to key points.\n"
            "Summarize the key points in at most two sentences that capture the essence of the passage.\n"
        )

        final_summary = []
        for summary_chunk in summary_chunks:
            if not summary_chunk or not summary_chunk.strip():
                continue
            response = openai.ChatCompletion.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": PROMPT},
                    {"role": "user", "content": summary_chunk},
                ],
            )
            summary = response["choices"][0]["message"]["content"]
            final_summary.append(summary)

        return final_summary

    def generate_full_summary(self, text_chunks_lib: dict) -> str:
        """Summarize every chunk of this document and join the results.

        Only the chunks stored under ``self.title`` are summarized. The
        previous code summarized every title in the mapping and then discarded
        all but this one.

        Args:
            text_chunks_lib: Mapping of title -> list of text chunks.

        Returns:
            The per-chunk summaries joined by blank lines ("" when there are
            no non-blank chunks).

        Raises:
            KeyError: If ``self.title`` is not a key of ``text_chunks_lib``.
        """
        chunk_summaries = [
            md.summarizer_gen(self.summarizer, sequence=text_chunk, maximum_tokens=500, minimum_tokens=100)
            for text_chunk in text_chunks_lib[self.title]
            if text_chunk.strip()  # nothing to summarize in a blank chunk
        ]
        return "\n\n".join(chunk_summaries)
|
55 |
+
|
takeaways.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openai
|
2 |
+
import os
|
3 |
+
|
4 |
+
def extract_bullet_points(text):
    """
    Extract bullet points from a string and return a list of bullet points.

    A line counts as a bullet when, after trimming whitespace, it starts with
    a symbol marker ("* ", "- ", "• ", "· ") or with a number followed by
    ". " (for example "1. " or "12. "). The marker is removed and the rest of
    the line is trimmed. The previous implementation always dropped exactly
    two characters, which left a leading space on numbered items and did not
    recognise numbers above 9.

    Args:
        text (str): The input text containing bullet points.

    Returns:
        list: A list of bullet points.
    """
    symbol_markers = ("* ", "- ", "• ", "· ")
    bullet_points = []
    for line in text.split("\n"):
        stripped = line.strip()
        if stripped.startswith(symbol_markers):
            item = stripped[2:]
        else:
            # Numbered marker: digits, then ". ", then the item text.
            number, separator, item = stripped.partition(". ")
            if not (separator and number.isdigit()):
                continue
        bullet_points.append(item.strip())
    return bullet_points
|
21 |
+
|
22 |
+
|
23 |
+
class KeyTakeaways:
    """Ask the completion model for bullet-point takeaways, chunk by chunk."""

    def __init__(self):
        """Configure the OpenAI client from the OPENAI_API_KEY environment variable."""
        openai.api_key = os.environ.get("OPENAI_API_KEY")

    def generate_key_takeaways(self, text_chunks_lib:dict) -> list:
        """Collect the takeaways for every chunk of every title.

        Args:
            text_chunks_lib: Mapping of title -> list of text chunks.

        Returns:
            All bullet points extracted from the model's answers, in request
            order.
        """
        PROMPT = (
            "\n"
            "You are a super intelligent human and helpful assistant.\n"
            "I am giving you parts of a video transcription that I want to learn from.\n"
            "In bullet points, give me at most 3 key takeaways from this text.\n"
        )

        collected = []
        for chunk_group in text_chunks_lib.values():
            for passage in chunk_group:
                completion = openai.Completion.create(
                    engine="text-davinci-003",
                    prompt=PROMPT + passage,
                    temperature=0.4,
                    max_tokens=1024,
                    top_p=1,
                    frequency_penalty=0.0,
                    presence_penalty=0.6,
                )
                answer_text = completion.choices[0].text.strip()
                collected.extend(extract_bullet_points(answer_text))

        return collected
|
transcription.py
ADDED
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# For downloading from youtube and transcribing audio
|
2 |
+
from pytube import YouTube
|
3 |
+
from moviepy.editor import *
|
4 |
+
from pydub import AudioSegment
|
5 |
+
from pydub.utils import make_chunks
|
6 |
+
import pydub
|
7 |
+
from pathlib import Path
|
8 |
+
|
9 |
+
# For getting text from PDF
|
10 |
+
from zipfile import ZipFile
|
11 |
+
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
12 |
+
from pdfminer.converter import TextConverter
|
13 |
+
from pdfminer.layout import LAParams
|
14 |
+
from pdfminer.pdfpage import PDFPage
|
15 |
+
from io import StringIO
|
16 |
+
|
17 |
+
# For transcription
|
18 |
+
import openai, whisper, torch
|
19 |
+
from faster_whisper import WhisperModel
|
20 |
+
import tiktoken
|
21 |
+
from nltk import tokenize
|
22 |
+
|
23 |
+
# For other stuff
|
24 |
+
import os, re
|
25 |
+
import time, math
|
26 |
+
|
27 |
+
# USEFUL CONSTANTS
|
28 |
+
|
29 |
+
# Segment duration in milliseconds: 6 minutes = 360 seconds.
DURATION = 6 * 60 * 1000

# Maximum audio file size in bytes (18 MB, decimal megabytes).
MAX_FILE_SIZE_BYTES = 18 * 1000 * 1000

# Model names used for transcription.
WHISPER_MODEL = "tiny"
MODEL_SIZE = "base"
|
38 |
+
|
39 |
+
class DownloadAudio:
    """Download the audio track of a YouTube video and store it as a .wav file.

    The audio is written to ``<video id>.wav`` inside the folder given to
    :meth:`download`. When that file is at least ``MAX_FILE_SIZE_BYTES`` large
    it is additionally split into numbered chunk files
    (``<video id>_<n>.wav``) in the same folder.
    """

    def __init__(self, link) -> None:
        """Create the pytube handle for ``link`` and derive the file names."""
        self.link = link
        self.yt = YouTube(self.link)
        self.YOUTUBE_VIDEO_ID = self._extract_video_id(link)
        self.WAV_FILE_NAME = f"{self.YOUTUBE_VIDEO_ID}.wav"

    @staticmethod
    def _extract_video_id(link: str) -> str:
        """Return the video id contained in a YouTube URL.

        Reads the ``v`` query parameter when present (``watch?v=<id>&t=..``)
        and otherwise falls back to the last path component
        (``youtu.be/<id>``, ``/shorts/<id>``, ``/embed/<id>``).

        Raises:
            ValueError: if no id can be found in ``link``.
        """
        # Imported locally so this class does not depend on a change to the
        # module-level import block.
        from urllib.parse import parse_qs, urlparse

        parsed = urlparse(link)
        ids = parse_qs(parsed.query).get("v")
        if ids:
            return ids[0]

        tail = parsed.path.rstrip("/").rsplit("/", 1)[-1]
        if tail and tail != "watch":
            return tail

        raise ValueError(f"Could not determine the YouTube video id from {link!r}")

    def get_yt_title(self, max_retries=None) -> str:
        """Return the title of the youtube video, retrying on failure.

        Args:
            max_retries: maximum number of retries before the last error is
                re-raised. ``None`` (the default) retries indefinitely.
        """
        failures = 0
        while True:
            try:
                return self.yt.title
            except Exception:
                # ``Exception`` instead of a bare ``except`` so that
                # KeyboardInterrupt / SystemExit can still stop the loop.
                failures += 1
                if max_retries is not None and failures > max_retries:
                    raise
                print("Failed to get name. Retrying...")
                time.sleep(1)
                # Rebuild the pytube object before the next attempt.
                self.yt = YouTube(self.link)

    def download(self, pathname: str):
        """Download the audio of the video into ``pathname`` as a .wav file.

        The download and conversion are skipped when the .wav file already
        exists. If the .wav file is at least ``MAX_FILE_SIZE_BYTES`` large it
        is also exported as a series of smaller chunk files next to it.

        Args:
            pathname: folder that receives the audio files; created if missing.

        Returns:
            The path of the complete (unsplit) .wav file.

        Raises:
            ValueError: if the video exposes no audio-only stream.
        """
        # makedirs also creates missing parent folders and tolerates an
        # already existing target.
        os.makedirs(pathname, exist_ok=True)
        FINAL_WAV_PATH = f"{pathname}/{self.WAV_FILE_NAME}"

        if not os.path.exists(FINAL_WAV_PATH):
            # Download the audio-only stream
            audiostream = self.yt.streams.filter(only_audio=True).first()
            if audiostream is None:
                raise ValueError(f"No audio stream available for {self.link!r}")
            outfile_path = audiostream.download(pathname)

            # Convert the downloaded file to .wav and release the reader
            wav_file = AudioFileClip(outfile_path)
            try:
                wav_file.write_audiofile(FINAL_WAV_PATH, bitrate="16k", fps=16000)
            finally:
                wav_file.close()

        # A file below the size limit needs no chunking, so it is returned
        # without decoding it into memory first.
        total_byte_size = os.path.getsize(FINAL_WAV_PATH)
        if total_byte_size < MAX_FILE_SIZE_BYTES:
            return FINAL_WAV_PATH

        # Load the .wav file
        audio = AudioSegment.from_wav(FINAL_WAV_PATH)

        # Estimate the size of the raw audio data in bytes
        channels = audio.channels
        sample_width = audio.sample_width
        duration_in_sec = math.ceil(len(audio) / 1000)
        sample_rate = audio.frame_rate
        bit_rate = sample_width * 8
        wav_file_size = (sample_rate * bit_rate * channels * duration_in_sec) / 8

        # Length of one chunk, scaled so a chunk is about MAX_FILE_SIZE_BYTES
        chunk_length_in_sec = math.ceil(
            (duration_in_sec * MAX_FILE_SIZE_BYTES) / wav_file_size
        )
        chunk_length_ms = chunk_length_in_sec * 1000
        chunks = make_chunks(audio, chunk_length_ms)

        # Export all of the individual chunks as wav files
        for i, chunk in enumerate(chunks):
            chunk_name = f"{self.YOUTUBE_VIDEO_ID}_{i}.wav"
            output_chunk_path = f"{pathname}/{chunk_name}"
            chunk.export(f"{output_chunk_path}", format="wav")

        return FINAL_WAV_PATH
|
113 |
+
|
114 |
+
|
115 |
+
class VideoTranscription:
    """Transcribes the audio of a youtube video given a link to it."""

    def __init__(self, datalink) -> None:
        """Store the link and set up the tokenizer and the Whisper model."""
        self.datalink = datalink
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        self.model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
        openai.api_key = os.environ.get("OPENAI_API_KEY")

    def transcribe(self) -> dict:
        """Return the transcription of the youtube video.

        Returns:
            A dict with the keys ``"title"``, ``"text"`` and ``"segments"``
            (see :meth:`get_text_from_link`).

        Raises:
            ValueError: if ``datalink`` does not start with ``"http"``. This
                class has no PDF code path; PDF files are handled by
                ``PDFTranscription``.
        """
        if not self.datalink.startswith("http"):
            raise ValueError(
                f"VideoTranscription expects an http(s) link, got {self.datalink!r}"
            )

        start_time = time.time()
        transcript = self.get_text_from_link()
        end_time = time.time()
        print(f"transcription took {end_time - start_time} seconds")
        return transcript

    def get_text_from_link(self) -> dict:
        """Download the audio behind ``datalink`` and transcribe it.

        Returns:
            A dict with
            ``"title"``: the title of the video,
            ``"text"``: the full transcript,
            ``"segments"``: one dict per sentence holding its ``"id"``,
            ``"text"`` and encoded ``"tokens"``.
        """
        # Reuse the id computed by DownloadAudio instead of parsing the link
        # a second time here.
        audio_file = DownloadAudio(self.datalink)
        FOLDER_NAME = f"./tests/{audio_file.YOUTUBE_VIDEO_ID}"

        # Ensure the folder (including ./tests) exists before downloading.
        os.makedirs(FOLDER_NAME, exist_ok=True)

        # Download the audio; the path of the complete wav file is returned
        original_file_name = audio_file.download(FOLDER_NAME)
        print(original_file_name)

        # Transcribe the audio and concatenate the text of all segments.
        # "$" is backslash-escaped in the transcript text
        # (NOTE(review): presumably so the UI does not render it as math -
        # confirm against the code that displays the text).
        chunk_segments, _ = self.model.transcribe(original_file_name, beam_size=5)
        text_transcriptions = "".join(
            chunk_segment.text.replace("$", "\\$") for chunk_segment in chunk_segments
        )

        # Tokenize each sentence of the transcription.
        sentences = tokenize.sent_tokenize(text_transcriptions)
        segments = [
            {
                "id": i,
                "text": sentence,
                "tokens": self.encoding.encode(sentence),
            }
            for i, sentence in enumerate(sentences)
        ]

        final_transcription = {
            "title": audio_file.get_yt_title(),
            "text": text_transcriptions,
            "segments": segments,
        }

        return final_transcription
|
174 |
+
|
175 |
+
|
176 |
+
class AudioTranscription:
    """Performs transcription on an uploaded WAV or MP3 file."""

    def __init__(self, audio_file) -> None:
        """Store the file and set up the tokenizer and the Whisper model.

        Args:
            audio_file: file-like object with a ``name`` attribute; it is
                passed to pydub for decoding in :meth:`transcribe`.
        """
        self.file = audio_file
        self.title = self.file.name
        self.folder_name = self._derive_folder_name(self.title)
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        self.model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
        openai.api_key = os.environ.get("OPENAI_API_KEY")

    @staticmethod
    def _derive_folder_name(title: str) -> str:
        """Return ``./tests/<title without spaces and without extension>``.

        ``splitext`` only looks at the file name, so a title without any
        extension keeps its full name instead of being cut at the leading
        ``.`` of ``./tests``.
        """
        stem, _ = os.path.splitext(title.replace(" ", ""))
        return f"./tests/{stem}"

    def get_redacted_name(self):
        """Return the folder name derived from the file title."""
        return self.folder_name

    def transcribe(self) -> dict:
        """Return the transcription of the audio file.

        The upload is stored in ``folder_name``; MP3 input is additionally
        exported as WAV, and the WAV file is what gets transcribed.

        Returns:
            A dict with
            ``"title"``: the name of the uploaded file,
            ``"text"``: the full transcript,
            ``"segments"``: one dict per sentence holding its ``"id"``,
            ``"text"`` and encoded ``"tokens"``.

        Raises:
            ValueError: if the file name ends in neither ``wav`` nor ``mp3``.
        """
        start_time = time.time()

        # Decide on the decoder first so unsupported files fail with a clear
        # message before anything is written to disk.
        lowered_title = self.title.lower()
        if lowered_title.endswith("wav"):
            file_type = "wav"
        elif lowered_title.endswith("mp3"):
            file_type = "mp3"
        else:
            raise ValueError(
                f"Unsupported audio file {self.title!r}: expected a wav or mp3 file"
            )

        if file_type == "wav":
            audio = pydub.AudioSegment.from_wav(self.file)
        else:
            audio = pydub.AudioSegment.from_mp3(self.file)

        # makedirs also creates the ./tests parent folder when it is missing.
        os.makedirs(self.folder_name, exist_ok=True)

        # Keep a copy of the upload in its original format.
        save_path = Path(self.folder_name) / self.file.name
        audio.export(save_path, format=file_type)

        if file_type == "mp3":
            # Export the already decoded audio as WAV directly rather than
            # decoding the re-encoded MP3 copy a second time.
            wav_stem = os.path.splitext(self.title)[0]
            final_wav_path = f"{self.folder_name}/{wav_stem}.wav"
            audio.export(final_wav_path, format="wav")
        else:
            # Hand the model a plain string path rather than a Path object.
            final_wav_path = str(save_path)

        # "$" is backslash-escaped in the transcript text
        # (NOTE(review): presumably so the UI does not render it as math -
        # confirm against the code that displays the text).
        chunk_segments, _ = self.model.transcribe(final_wav_path, beam_size=5)
        text_transcriptions = "".join(
            chunk_segment.text.replace("$", "\\$") for chunk_segment in chunk_segments
        )

        # Tokenize each sentence of the transcription.
        sentences = tokenize.sent_tokenize(text_transcriptions)
        segments = [
            {
                "id": i,
                "text": sentence,
                "tokens": self.encoding.encode(sentence),
            }
            for i, sentence in enumerate(sentences)
        ]

        final_transcription = {
            "title": self.title,
            "text": text_transcriptions,
            "segments": segments,
        }
        end_time = time.time()
        print(f"transcription took {end_time - start_time} seconds")

        return final_transcription
|
239 |
+
|
240 |
+
def convert_pdf_to_txt_pages(path):
    """Extract the text of a PDF page by page.

    Args:
        path: the value handed to ``PDFPage.get_pages``
            (NOTE(review): the caller in this module passes an uploaded file
            object rather than a path string - confirm the expected type).

    Returns:
        A tuple ``(texts, nbPages)`` where ``texts`` holds one string per
        page and ``nbPages`` is the number of pages.
    """
    texts = []
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The converter keeps appending to ``retstr``; ``consumed`` remembers how
    # much of the buffer belongs to earlier pages so only the new part is
    # stored for the current page.
    consumed = 0
    try:
        for page in PDFPage.get_pages(path):
            interpreter.process_page(page)
            buffered_text = retstr.getvalue()
            texts.append(buffered_text[consumed:])
            consumed = len(buffered_text)
    finally:
        # Release the converter and the buffer even when a page fails.
        device.close()
        retstr.close()

    # One entry is appended per page, so the list length is the page count;
    # a separate counting pass over the document is not needed.
    return texts, len(texts)
|
266 |
+
|
267 |
+
class PDFTranscription:
    """Extracts the text of an uploaded PDF file and splits it into sentences."""

    def __init__(self, pdf_file):
        """Store the file and set up the tokenizer.

        Args:
            pdf_file: file-like object with a ``name`` attribute; it is
                handed to ``convert_pdf_to_txt_pages`` in :meth:`transcribe`.
        """
        self.file = pdf_file
        self.title = pdf_file.name
        self.folder_name = self._derive_folder_name(self.title)
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    @staticmethod
    def _derive_folder_name(title: str) -> str:
        """Return ``./tests/<title without spaces and without extension>``.

        ``splitext`` only looks at the file name, so a title without any
        extension keeps its full name instead of being cut at the leading
        ``.`` of ``./tests``.
        """
        stem, _ = os.path.splitext(title.replace(" ", ""))
        return f"./tests/{stem}"

    def get_redacted_name(self):
        """Return the folder name derived from the file title."""
        return self.folder_name

    def transcribe(self):
        """Return the text of the PDF together with its sentences.

        Returns:
            A dict with
            ``"title"``: the name of the uploaded file,
            ``"text"``: the text of all pages joined together,
            ``"segments"``: one dict per sentence holding its ``"id"``,
            ``"text"`` and encoded ``"tokens"``.
        """
        # Only the page texts are needed here; the page count is ignored.
        page_texts, _ = convert_pdf_to_txt_pages(self.file)
        pdf_transcription = "".join(page_texts)

        sentences = tokenize.sent_tokenize(pdf_transcription)
        segments = [
            {
                "id": i,
                "text": sentence,
                "tokens": self.encoding.encode(sentence),
            }
            for i, sentence in enumerate(sentences)
        ]

        final_transcription = {
            "title": self.title,
            "text": pdf_transcription,
            "segments": segments,
        }
        return final_transcription
|
300 |
+
|
301 |
+
|
302 |
+
|