lordvader31 commited on
Commit
79b94f8
·
1 Parent(s): 499c2f5

major update from bitbucket

Browse files
Files changed (9) hide show
  1. app.py +366 -0
  2. classifier.py +0 -0
  3. keywords.py +28 -0
  4. mindmap.py +50 -0
  5. models.py +93 -0
  6. prompts/mindmap.prompt +11 -0
  7. summary.py +55 -0
  8. takeaways.py +51 -0
  9. transcription.py +302 -0
app.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Streamlit classes
2
+ import streamlit as st
3
+ from streamlit_agraph import agraph, Node, Edge, Config
4
+ from streamlit_chat import message
5
+
6
+ # Data manipulation and embeddings
7
+ import pandas as pd
8
+ import numpy as np
9
+ import openai
10
+ from openai.embeddings_utils import distances_from_embeddings
11
+ import whisper
12
+
13
+ # Exec tasks
14
+ import os, json
15
+ import math
16
+ import re
17
+
18
+ # Custom classes
19
+ from transcription import *
20
+ from keywords import Keywords
21
+ from summary import TextSummarizer
22
+ from takeaways import KeyTakeaways
23
+ from mindmap import MindMap
24
+ import models as md
25
+
26
+
27
# Matches youtube.com and youtu.be links, with or without the scheme / "www.".
# Written as a raw string because the pattern contains backslash sequences
# (\: \/ \.) that are not valid Python string escapes.
REGEXP_YOUTUBE_URL = r"^(https?\:\/\/)?((www\.)?youtube\.com|youtu\.be)\/.+$"
28
+
29
# NOTE(review): this Whisper model is not referenced anywhere else in the
# visible script (transcription.py builds its own faster-whisper model) —
# confirm whether this load is still needed.
model = whisper.load_model('base')

# Script-level state shared by the sidebar pipeline and the result tabs.
output = ''
data = []
data_transcription = {"title": "", "text": ""}
embeddings = []
text_chunks_lib = {}
user_input = None

# Analysis results shown in the tabs; empty until an analysis has run.
tldr = ""
summary = ""
takeaways = []

# Working folder for artefacts and pipeline status flags.
folder_name = "./tests"
input_accepted = False
is_completed_analysis = False
45
+
46
def get_initial_message():
    """Return the seed conversation used to initialise the chat history.

    Returns:
        list: A new list of ``{"role", "content"}`` message dicts (system,
        user, assistant), built afresh on every call.
    """
    return [
        {"role": "system", "content": "You are a helpful AI Tutor who answers brief questions about AI."},
        {"role": "user", "content": "I want to learn AI"},
        {"role": "assistant", "content": "That's awesome, what do you want to know about AI?"},
    ]
53
+
54
# Sample graph for streamlit_agraph: two image nodes joined by one edge.
# NOTE(review): nodes / edges / config are not used by any tab in the visible
# script — looks like leftover demo data; confirm before removing.
nodes = [
    Node(
        id="Spiderman",
        label="Peter Parker",
        size=25,
        shape="circularImage",
        image="http://marvel-force-chart.surge.sh/marvel_force_chart_img/top_spiderman.png",
    ),  # Node also accepts extra **kwargs
    Node(
        id="Captain_Marvel",
        size=25,
        shape="circularImage",
        image="http://marvel-force-chart.surge.sh/marvel_force_chart_img/top_captainmarvel.png",
    ),
]
edges = [
    Edge(source="Captain_Marvel", label="friend_of", target="Spiderman"),
]

config = Config(
    width=750,
    height=950,
    directed=True,
    physics=True,
    hierarchical=False,
)

# OpenAI key as provided through the environment (may be None).
user_secret = os.getenv("OPENAI_API_KEY")

# Page header: explain what the application does.
st.header('Almithal')
st.subheader('Almithal is a comprehensive video and PDF study buddy.')
st.write('It provides a summary, transcription, key insights, a mind map and a Q&A feature where you can actually "talk" to the datasource.')

# Progress bar updated by the analysis pipeline.
bar = st.progress(0)
90
+
91
+ # =========== SIDEBAR FOR GENERATION ===========
92
with st.sidebar:
    # ---- Input widgets -------------------------------------------------
    youtube_link = st.text_input(label = "Type in your Youtube link", placeholder = "", key="url")
    st.markdown("OR")
    pdf_file = st.file_uploader("Upload your PDF", type="pdf")
    st.markdown("OR")
    audio_file = st.file_uploader("Upload your MP3 audio file", type=["wav", "mp3"])

    gen_keywords = st.radio(
        "Generate keywords from text?",
        ('Yes', 'No')
    )

    gen_summary = st.radio(
        "Generate summary from text? (recommended for label matching below, but will take longer)",
        ('Yes', 'No')
    )

    if st.button("Start Analysis"):
        # Stays None when no usable source was supplied, so the analysis
        # below is skipped and the error message remains on screen (a forced
        # rerun would immediately wipe it).
        segments = None

        # ---- 1. Transcribe the selected source --------------------------
        if re.search(REGEXP_YOUTUBE_URL, youtube_link):
            if "=" not in youtube_link:
                # The video id is taken from the text after "="; links
                # without a query string cannot be handled.
                st.error("Please use a full YouTube link of the form https://www.youtube.com/watch?v=...")
            else:
                vte = VideoTranscription(youtube_link)
                YOUTUBE_VIDEO_ID = youtube_link.split("=")[1]
                folder_name = f"./tests/{YOUTUBE_VIDEO_ID}"
                os.makedirs(folder_name, exist_ok=True)

                with st.spinner('Running process...'):
                    data_transcription = vte.transcribe()
                    segments = data_transcription['segments']

                with open(f"{folder_name}/data.json", "w") as f:
                    json.dump(data_transcription, f, indent=4)

        # PDF transcription
        elif pdf_file is not None:
            pte = PDFTranscription(pdf_file)
            folder_name = pte.get_redacted_name()
            os.makedirs(folder_name, exist_ok=True)

            with st.spinner('Running process...'):
                data_transcription = pte.transcribe()
                segments = data_transcription['segments']

        # Audio transcription
        elif audio_file is not None:
            ate = AudioTranscription(audio_file)
            folder_name = ate.get_redacted_name()
            # exist_ok avoids the crash on a second run for the same file.
            os.makedirs(folder_name, exist_ok=True)

            with st.spinner('Running process...'):
                data_transcription = ate.transcribe()
                segments = data_transcription['segments']

            with open(f"{folder_name}/data.json", "w") as f:
                json.dump(data_transcription, f, indent=4)

        else:
            st.error("Please type in your youtube link or upload the PDF")

        if segments is not None:
            # ---- 2. Persist the transcript ------------------------------
            with open(f"{folder_name}/data_transcription.json", "w") as f:
                json.dump(data_transcription, f, indent=4)

            # ---- 3. Embeddings (reused from disk when already computed) --
            if not os.path.exists(f"{folder_name}/word_embeddings.csv"):
                for i, segment in enumerate(segments):
                    # First half of the progress bar tracks embedding calls.
                    bar.progress(max(math.ceil((i/len(segments) * 50)), 1))
                    response = openai.Embedding.create(
                        input= segment["text"].strip(),
                        model="text-embedding-ada-002"
                    )
                    embeddings = response['data'][0]['embedding']
                    data.append({
                        "text": segment["text"].strip(),
                        "embedding": embeddings
                    })

                pd.DataFrame(data).to_csv(f'{folder_name}/word_embeddings.csv')
            else:
                data = pd.read_csv(f'{folder_name}/word_embeddings.csv')
                embeddings = data["embedding"]

            bar.progress(75)

            text_df = pd.DataFrame.from_dict({"title": [data_transcription["title"]], "text":[data_transcription["text"]]})
            input_accepted = True

            # ---- 4. Chunk the text and run the analyses -----------------
            with st.spinner('Breaking up the text and doing analysis...'):
                title_entry = text_df['title'][0]
                for i in range(len(text_df)):
                    # Groups of sentences that fit the transformer's token limit.
                    nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)
                    text_chunks_lib[title_entry] = [
                        " ".join(map(str, sentence_group))
                        for sentence_group in nested_sentences
                    ]

                # Keywords, only when requested in the sidebar.
                if gen_keywords == 'Yes':
                    key_engine = Keywords(title_entry)
                    keywords = key_engine.get_keywords(text_chunks_lib)
                else:
                    keywords = []

                # Summary and TL;DR, only when requested.
                if gen_summary == 'Yes':
                    se = TextSummarizer(title_entry)
                    with st.spinner("Generating summary and TLDR..."):
                        summary = se.generate_full_summary(text_chunks_lib)
                        summary_list = summary.split("\n\n")
                        tldr = se.generate_short_summary(summary_list)

                # Key takeaways.
                kt = KeyTakeaways()
                with st.spinner("Generating key takeaways ... "):
                    takeaways = kt.generate_key_takeaways(text_chunks_lib)

                is_completed_analysis = True
                bar.progress(100)
218
+
219
# Key takeaways are listed above the tabs once an analysis has finished.
if is_completed_analysis:
    st.header("Key Takeaways")
    st.write("Here are some of the key takeaways from the data:")
    for bullet in takeaways:
        st.markdown(f"- {bullet}")


tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["Introduction", "Summary", "Transcription", "Mind Map", "Keywords", "Q&A"])

# =========== INTRODUCTION ===========
with tab1:
    st.subheader("Introduction")
    st.markdown("## How do I use this?")
    st.markdown("Do one of the following")
    for instruction in (
        '* Type in your youtube URL that you want worked on',
        '* Place the PDF file that you want worked on',
    ):
        st.markdown(instruction)
    st.markdown("**Once the file / url has finished saving, a 'Start Analysis' button will appear. Click on this button to begin the note generation**")
    st.warning("NOTE: This is just a demo product in alpha testing. Any and all bugs will soon be fixed")
    st.warning("After the note taking is done, you will see multiple tabs for more information")

# =========== SUMMARIZATION ===========
with tab2:
    if not is_completed_analysis:
        st.warning("Please wait for the analysis to finish")
    else:
        st.header("TL;DR")
        for point in tldr:
            st.markdown(f"- {point}")
        st.header("Summary")
        st.write(summary)

# =========== TRANSCRIPTION ===========
with tab3:
    st.header("Transcription")
    if not is_completed_analysis:
        st.warning("Please wait for the analysis to finish")
    else:
        with st.spinner("Generating transcript ..."):
            st.write("")
            # One paragraph per text chunk produced by the pipeline.
            for chunk in text_chunks_lib[title_entry]:
                st.write(chunk)

# =========== MIND MAP ===========
with tab4:
    st.header("Mind Map")
    if not is_completed_analysis:
        st.warning("Please wait for the analysis to finish")
    else:
        mindmap = MindMap()
        with st.spinner("Generating mind map..."):
            mindmap.generate_graph(text_chunks_lib)
270
+
271
+ # =========== KEYWORDS ===========
272
with tab5:
    st.header("Keywords:")
    if not is_completed_analysis:
        st.warning("Please wait for the analysis to finish")
    elif gen_keywords != 'Yes':
        # gen_keywords holds the radio label ('Yes' / 'No'); both are truthy
        # strings, so it has to be compared explicitly.
        st.info("Keyword generation was turned off in the sidebar.")
    else:
        for rank, keyword in enumerate(keywords, start=1):
            st.markdown(f"{rank}. {keyword}")
279
+
280
+ # =========== QUERY BOT ===========
281
with tab6:
    # Chat history survives Streamlit reruns through session_state.
    if 'generated' not in st.session_state:
        st.session_state['generated'] = []

    if 'past' not in st.session_state:
        st.session_state['past'] = []

    def get_text():
        """Render the question box and return its current text."""
        st.header("Ask me something about the video:")
        input_text = st.text_input("You: ", key="prompt")
        return input_text


    def get_embedding_text(prompt):
        """Return the stored text segments closest to ``prompt``.

        Embeds the prompt with OpenAI, loads ``word_embeddings.csv`` from
        the analysis folder, ranks rows by cosine distance and joins the
        four closest segment texts with a "###" separator.
        """
        import ast  # local import: only needed to parse the stored vectors

        response = openai.Embedding.create(
            input= prompt.strip(),
            model="text-embedding-ada-002"
        )
        q_embedding = response['data'][0]['embedding']

        folder = st.session_state.get('analysis_folder', folder_name)
        df = pd.read_csv(f'{folder}/word_embeddings.csv', index_col=0)
        # Embeddings were written as list literals; literal_eval parses them
        # without executing arbitrary code from the CSV (unlike eval).
        df['embedding'] = df['embedding'].apply(ast.literal_eval).apply(np.array)

        df['distances'] = distances_from_embeddings(q_embedding, df['embedding'].values, distance_metric='cosine')

        # Keep the four nearest segments as context.
        closest = df.sort_values('distances', ascending=True).head(4)
        return "\n\n###\n\n".join(list(closest["text"]))

    def generate_response(prompt):
        """Ask the completion model to answer ``prompt`` and return its text."""
        one_shot_prompt = '''
        I am YoutubeGPT, a highly intelligent question answering bot.
        If you ask me a question that is rooted in truth, I will give you the answer.
        Q: What is human life expectancy in the United States?
        A: Human life expectancy in the United States is 78 years.
        Q: '''+prompt+'''
        A:
        '''
        completions = openai.Completion.create(
            engine = "text-davinci-003",
            prompt = one_shot_prompt,
            max_tokens = 1024,
            n = 1,
            stop=["Q:"],
            temperature=0.5,
        )
        return completions.choices[0].text

    # Typing a question triggers a rerun in which the "Start Analysis"
    # button is no longer pressed, so the in-memory flag is False again.
    # Remember where the last finished analysis stored its files so the
    # question box stays usable on later runs.
    if is_completed_analysis:
        st.session_state['analysis_folder'] = folder_name

    if 'analysis_folder' in st.session_state:
        user_input = get_text()
    else:
        user_input = None
        st.warning("Please wait for the analysis to finish")

    if 'messages' not in st.session_state:
        st.session_state['messages'] = get_initial_message()

    # Answer each submitted prompt once; unrelated reruns keep the same text
    # in the input box and must not trigger another API call.
    if user_input and user_input != st.session_state.get('last_prompt'):
        text_embedding = get_embedding_text(user_input)
        with open(f"{st.session_state['analysis_folder']}/data_transcription.json", "r") as f:
            title = json.load(f)['title']
        # title is a plain string: use it directly (joining it would insert
        # the separator between every character).
        user_input_embedding = 'Using this context: "'+title+'. '+text_embedding+'", answer the following question. \n'+user_input
        output = generate_response(user_input_embedding)
        st.session_state['last_prompt'] = user_input
        st.session_state.past.append(user_input)
        st.session_state.generated.append(output)

    # Newest exchange first.
    if st.session_state['generated']:
        for i in range(len(st.session_state['generated'])-1, -1, -1):
            message(st.session_state["generated"][i], key=str(i))
            message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
364
+
365
+
366
+ # st.header("What else")
classifier.py ADDED
File without changes
keywords.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import models as md
2
+ import pandas as pd
3
+
4
class Keywords:
    """Extracts scored keywords from chunked text with a KeyBERT model."""

    def __init__(self, title_element:str):
        """Store the document title and load the keyword model.

        Args:
            title_element: Title of the document being analysed.
        """
        self.title_element = title_element
        self.kw_model = md.load_keyword_model()

    @staticmethod
    def _best_scores(scored_keywords) -> dict:
        """Collapse ``(keyword, score)`` pairs, keeping each keyword's highest score."""
        best = {}
        for keyword, score in scored_keywords:
            if keyword not in best or score > best[keyword]:
                best[keyword] = score
        return best

    def get_keywords(self, text_chunks_lib:dict) -> "pd.Series":
        """Return keywords scoring above 0.25, best first.

        Args:
            text_chunks_lib: Mapping of title -> list of text chunks.

        Returns:
            A pandas Series of keyword strings ordered by descending score.
        """
        kw_dict = dict()

        for key in text_chunks_lib:
            keywords_list = []
            for text_chunk in text_chunks_lib[key]:
                keywords_list += md.keyword_gen(self.kw_model, text_chunk)
            # The same keyword can come back from several chunks; keep its
            # best score so a weaker later hit cannot hide a strong one.
            kw_dict[key] = self._best_scores(keywords_list)

        # Reshape {title: {keyword: score}} into one row per keyword/title.
        kw_df0 = pd.DataFrame.from_dict(kw_dict).reset_index()
        kw_df0.rename(columns={'index': 'keyword'}, inplace=True)
        kw_df = pd.melt(kw_df0, id_vars=['keyword'], var_name='title', value_name='score').dropna()

        kw_column_list = ['keyword', 'score']
        kw_df = kw_df[kw_df['score'] > 0.25][kw_column_list].sort_values(['score'], ascending=False).reset_index().drop(columns='index')

        return kw_df['keyword']
mindmap.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import openai
3
+ import json
4
+ import graphviz
5
+ import streamlit as st
6
+
7
class MindMap:
    """Builds a relationship graph ("mind map") from text chunks via OpenAI."""

    def __init__(self):
        openai.api_key = os.getenv("OPENAI_API_KEY")

    @staticmethod
    def _fill_prompt(template:str, text_chunk:str) -> str:
        """Return ``template`` with its ``$prompt`` placeholder replaced by ``text_chunk``."""
        return template.replace("$prompt", text_chunk)

    @staticmethod
    def _parse_relations(raw:str) -> list:
        """Parse the model's JSON list of triples; malformed output yields ``[]``.

        Only list entries with at least three items are kept, because the
        graph builder reads positions 0 and 2.
        """
        try:
            data = json.loads('{"relations":' + raw + '}')
        except json.JSONDecodeError:
            return []
        relations = data["relations"]
        if not isinstance(relations, list):
            return []
        return [rel for rel in relations if isinstance(rel, list) and len(rel) >= 3]

    def get_connections(self, text_chunks_libs:dict) -> list:
        """Ask the completion model for relationships in every text chunk.

        Args:
            text_chunks_libs: Mapping of title -> list of text chunks.

        Returns:
            A flat list of ``[entity_1, relationship, entity_2]`` lists.
        """
        with open("./prompts/mindmap.prompt") as state_prompt:
            template = state_prompt.read()

        final_connections = []
        for key in text_chunks_libs:
            for text_chunk in text_chunks_libs[key]:
                # Fill a fresh copy of the template for each chunk;
                # overwriting the template would consume the placeholder and
                # re-send the first chunk every time.
                prompt = self._fill_prompt(template, text_chunk)

                response = openai.Completion.create(
                    engine="text-davinci-003",
                    prompt = prompt,
                    temperature=0.5,
                    max_tokens=2048,
                    top_p=1,
                    frequency_penalty=0.0,
                    presence_penalty=0.0,
                )

                relationships = response.choices[0].text
                final_connections.extend(self._parse_relations(relationships))
        return final_connections


    def generate_graph(self, text_chunks_libs:dict):
        """Render the extracted relationships as a directed Graphviz chart in Streamlit."""
        graph = graphviz.Digraph()
        all_connections = self.get_connections(text_chunks_libs)
        for connection in all_connections:
            from_node = str(connection[0])
            to_node = str(connection[2])
            graph.edge(from_node, to_node)
        st.graphviz_chart(graph)
models.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, BartTokenizer, BartForConditionalGeneration
2
+
3
+ import streamlit as st
4
+ from keybert import KeyBERT
5
+ import re
6
+
7
def create_nest_sentences(document:str, token_max_length = 1024, tokenizer=None):
    """Group the sentences of ``document`` into chunks below a token budget.

    Args:
        document: Text to split; newlines are treated as spaces.
        token_max_length: A chunk is closed before a sentence would bring
            its token count to this value or above.
        tokenizer: Optional callable used to count tokens; called as
            ``tokenizer(text, truncation=False, padding=False)`` and the
            length of element ``[0]`` of the result is used. Defaults to
            the 'facebook/bart-large-mnli' tokenizer.

    Returns:
        list[list[str]]: Chunks, each a list of consecutive sentences. A
        single sentence above the budget gets a chunk of its own.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

    nested = []
    sent = []
    length = 0

    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
        if not sentence.strip():
            continue
        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0]
        sentence_length = len(tokens_in_sentence)

        if length + sentence_length < token_max_length:
            sent.append(sentence)
            length += sentence_length
        else:
            # Close the current chunk (if any) and start the next one with
            # this sentence, carrying its token count forward so the new
            # chunk's budget is tracked correctly.
            if sent:
                nested.append(sent)
            sent = [sentence]
            length = sentence_length

    if sent:
        nested.append(sent)
    return nested
27
+
28
# cache_resource keeps one shared model object per process; it is the
# Streamlit cache intended for models, whereas cache_data serialises and
# copies the return value on every call.
@st.cache_resource
def load_keyword_model():
    """Create (once) and return the KeyBERT keyword-extraction model."""
    kw_model = KeyBERT()
    return kw_model
32
+
33
def keyword_gen(kw_model, sequence:str):
    """Return the keywords ``kw_model.extract_keywords`` finds in ``sequence``.

    The model is queried for single-word keyphrases with English stop words
    removed, MMR enabled at diversity 0.5, limited to the top 10 results.
    """
    extraction_options = {
        "keyphrase_ngram_range": (1, 1),
        "stop_words": 'english',
        "use_mmr": True,
        "diversity": 0.5,
        "top_n": 10,
    }
    return kw_model.extract_keywords(sequence, **extraction_options)
41
+
42
+
43
+
44
# Reference: https://huggingface.co/facebook/bart-large-cnn
# cache_resource shares a single pipeline object across reruns; it is the
# Streamlit cache intended for models (cache_data would serialise and copy
# the return value).
@st.cache_resource
def load_summary_model():
    """Create (once) and return the BART-large-CNN summarization pipeline."""
    model_name = "facebook/bart-large-cnn"
    summarizer = pipeline(task='summarization', model=model_name)
    return summarizer
50
+
51
def load_summary_model_large():
    """Build a summarization pipeline from an explicit BART tokenizer and model.

    NOTE(review): the checkpoint name is the MNLI variant, while the
    summarization loader in this file uses the CNN variant — confirm this
    is the intended checkpoint for summarization.
    """
    checkpoint = "facebook/bart-large-mnli"
    bart_tokenizer = BartTokenizer.from_pretrained(checkpoint)
    bart_model = BartForConditionalGeneration.from_pretrained(checkpoint)
    return pipeline(
        task='summarization',
        model=bart_model,
        tokenizer=bart_tokenizer,
        framework='pt',
    )
57
+
58
def summarizer_gen(summarizer, sequence:str, maximum_tokens:int, minimum_tokens:int):
    """Summarize ``sequence`` and return the first result's ``summary_text``.

    Args:
        summarizer: Callable summarization pipeline.
        sequence: Text to summarize.
        maximum_tokens: Passed to the pipeline as ``max_length``.
        minimum_tokens: Passed to the pipeline as ``min_length``.
    """
    generation_options = {
        "num_beams": 4,
        "length_penalty": 2.0,
        "max_length": maximum_tokens,
        "min_length": minimum_tokens,
        "do_sample": False,
        "early_stopping": True,
        "no_repeat_ngram_size": 3,
    }
    results = summarizer(sequence, **generation_options)
    first_result = results[0]
    return first_result.get('summary_text')
68
+
69
+
70
+ # # Reference: https://www.datatrigger.org/post/nlp_hugging_face/
71
+ # # Custom summarization pipeline (to handle long articles)
72
+ # def summarize(text, minimum_length_of_summary = 100):
73
+ # # Tokenize and truncate
74
+ # inputs = tokenizer([text], truncation=True, max_length=1024, return_tensors='pt').to('cuda')
75
+ # # Generate summary
76
+ # summary_ids = model_bart.generate(inputs['input_ids'], num_beams=4, min_length = minimum_length_of_summary, max_length=400, early_stopping=True)
77
+ # # Untokenize
78
+ # return([tokenizer_bart.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0])
79
+
80
+
81
# Reference: https://huggingface.co/spaces/team-zero-shot-nli/zero-shot-nli/blob/main/utils.py
# cache_resource shares a single pipeline object across reruns; it is the
# Streamlit cache intended for models (cache_data would serialise and copy
# the return value).
@st.cache_resource
def load_model():
    """Create (once) and return the BART-large-MNLI zero-shot classification pipeline."""
    model_name = "facebook/bart-large-mnli"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    classifier = pipeline(task='zero-shot-classification', model=model, tokenizer=tokenizer, framework='pt')
    return classifier
89
+
90
def classifier_zero(classifier, sequence:str, labels:list, multi_class:bool):
    """Classify ``sequence`` against ``labels`` and return ``(labels, scores)``.

    ``multi_class`` is forwarded to the classifier as ``multi_label``.
    """
    result = classifier(sequence, labels, multi_label=multi_class)
    predicted_labels = result['labels']
    predicted_scores = result['scores']
    return predicted_labels, predicted_scores
93
+
prompts/mindmap.prompt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Given a prompt, extrapolate as many relationships as possible from it and provide a list of updates.
2
+
3
+ If an update is a relationship, provide [ENTITY 1, RELATIONSHIP, ENTITY 2]. The relationship is directed, so the order matters.
4
+
5
+ Example:
6
+ prompt: Alice is Bob's roommate. Bob is Charlie's friend.
7
+ updates:
8
+ [["Alice", "roommate", "Bob"], ["Bob", "friend", "Charlie"]]
9
+
10
+ prompt: $prompt
11
+ updates:
summary.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import models as md
2
+ import nltk
3
+
4
+ import openai
5
+ import os
6
+
7
+ nltk.download("punkt")
8
+
9
class TextSummarizer:
    """Produces a full summary and a short TL;DR for one titled document."""

    def __init__(self, title):
        """Load the summarization model and configure the OpenAI key.

        Args:
            title: Key of the document inside the text-chunk library.
        """
        self.title = title
        self.model = "gpt-3.5-turbo"
        self.summarizer = md.load_summary_model()
        openai.api_key = os.getenv("OPENAI_API_KEY")

    def generate_short_summary(self, summary_chunks:list) -> list:
        """Condense each summary chunk to at most two sentences via the chat model.

        Args:
            summary_chunks: Iterable of summary strings (one per chunk).

        Returns:
            One short summary per non-blank input chunk, in order.
        """
        PROMPT = """
        You are a helpful assistant that summarizes youtube videos.
        Someone has already summarized the video to key points.
        Summarize the key points in at most two sentences that capture the essence of the passage.
        """

        final_summary = []
        for summary_chunk in summary_chunks:
            # Nothing to summarize: skip the API call for blank chunks.
            if not summary_chunk or not summary_chunk.strip():
                continue
            response = openai.ChatCompletion.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": PROMPT},
                    {"role": "user", "content": summary_chunk},
                ],
            )
            summary = response["choices"][0]["message"]["content"]
            final_summary.append(summary)

        return final_summary

    def generate_full_summary(self, text_chunks_lib:dict) -> str:
        """Summarize every chunk stored under ``self.title`` and join the results.

        Args:
            text_chunks_lib: Mapping of title -> list of text chunks.

        Returns:
            Chunk summaries joined by blank lines, or ``""`` when the
            library has no entry for ``self.title``.
        """
        # Only this document's summary is returned, so only its chunks are
        # run through the model.
        chunk_summaries = [
            md.summarizer_gen(self.summarizer, sequence=text_chunk, maximum_tokens=500, minimum_tokens=100)
            for text_chunk in text_chunks_lib.get(self.title, [])
        ]
        return "\n\n".join(chunk_summaries)
55
+
takeaways.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import os
3
+
4
def extract_bullet_points(text):
    """
    Extract bullet points from a string and return a list of bullet points.

    A line counts as a bullet when, ignoring surrounding whitespace, it
    starts with ``*``, ``-``, ``•``, ``·`` or a one/two-digit number
    followed by a period, then whitespace and some text.

    Args:
        text (str): The input text containing bullet points.

    Returns:
        list: The bullet texts with marker and surrounding whitespace removed.
    """
    import re  # local import: this module does not import re at file level

    # Marker, mandatory whitespace, then the bullet text (captured).
    bullet_pattern = re.compile(r"^(?:[*\-•·]|\d{1,2}\.)\s+(.+)$")

    bullet_points = []
    for line in text.split("\n"):
        match = bullet_pattern.match(line.strip())
        if match:
            bullet_points.append(match.group(1))
    return bullet_points
21
+
22
+
23
class KeyTakeaways:
    """Asks the OpenAI completion model for key takeaways of chunked text."""

    def __init__(self):
        openai.api_key = os.getenv("OPENAI_API_KEY")

    def generate_key_takeaways(self, text_chunks_lib:dict) -> list:
        """Collect the bullet-point takeaways of every chunk in the library.

        Args:
            text_chunks_lib: Mapping of title -> list of text chunks.

        Returns:
            A flat list of takeaway strings, in chunk order.
        """
        PROMPT = """
        You are a super intelligent human and helpful assistant.
        I am giving you parts of a video transcription that I want to learn from.
        In bullet points, give me at most 3 key takeaways from this text.
        """

        collected = []
        for chunks in text_chunks_lib.values():
            for chunk in chunks:
                completion = openai.Completion.create(
                    engine="text-davinci-003",
                    prompt=PROMPT + chunk,
                    temperature=0.4,
                    max_tokens=1024,
                    top_p=1,
                    frequency_penalty=0.0,
                    presence_penalty=0.6,
                )
                answer = completion.choices[0].text.strip()
                collected.extend(extract_bullet_points(answer))

        return collected
transcription.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # For downloading from youtube and transcribing audio
2
+ from pytube import YouTube
3
+ from moviepy.editor import *
4
+ from pydub import AudioSegment
5
+ from pydub.utils import make_chunks
6
+ import pydub
7
+ from pathlib import Path
8
+
9
+ # For getting text from PDF
10
+ from zipfile import ZipFile
11
+ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
12
+ from pdfminer.converter import TextConverter
13
+ from pdfminer.layout import LAParams
14
+ from pdfminer.pdfpage import PDFPage
15
+ from io import StringIO
16
+
17
+ # For transcription
18
+ import openai, whisper, torch
19
+ from faster_whisper import WhisperModel
20
+ import tiktoken
21
+ from nltk import tokenize
22
+
23
+ # For other stuff
24
+ import os, re
25
+ import time, math
26
+
27
+ # USEFUL CONSTANTS
28
+
29
+ # Duration is set to 6 minutes = 360 seconds = 360000 milliseconds
30
+ DURATION = 360000
31
+
32
+ # Maximum audio file size is 18MB
33
+ MAX_FILE_SIZE_BYTES = 18000000
34
+
35
+ # The model to use for transcription
36
+ WHISPER_MODEL = "tiny"
37
+ MODEL_SIZE = "base"
38
+
39
class DownloadAudio:
    """Downloads the audio of a youtube video and stores it as .wav in a folder."""

    def __init__(self, link) -> None:
        """Prepare the download for ``link``.

        The video id is taken from the text after the first "=" of the
        link, so the link must contain a query string such as
        ``watch?v=<id>``; otherwise an IndexError is raised.
        """
        self.link = link
        self.yt = YouTube(self.link)
        self.YOUTUBE_VIDEO_ID = link.split("=")[1]
        self.WAV_FILE_NAME = f"{self.YOUTUBE_VIDEO_ID}.wav"

    def get_yt_title(self, max_retries: int = 5) -> str:
        """Return the title of the youtube video.

        Fetching the title can fail transiently, so the lookup is retried
        with a fresh ``YouTube`` object and a one-second pause in between.

        Args:
            max_retries: Maximum number of attempts before giving up.

        Raises:
            RuntimeError: If the title could not be fetched in
                ``max_retries`` attempts.
        """
        last_error = None
        for _ in range(max_retries):
            try:
                return self.yt.title
            except Exception as err:
                last_error = err
                print("Failed to get name. Retrying...")
                time.sleep(1)
                self.yt = YouTube(self.link)
        raise RuntimeError(f"Could not fetch the title of {self.link}") from last_error

    def download(self, pathname:str):
        """
        Download the audio of the youtube video into ``pathname`` as a .wav
        file and return the path of that full-length .wav file.

        When the .wav file is at least MAX_FILE_SIZE_BYTES large, it is also
        split into ``<video id>_<n>.wav`` chunk files written next to it.
        The return value is the full-length file in both cases.
        """
        # Create the folder for the VIDEO_ID, including missing parents.
        os.makedirs(pathname, exist_ok=True)
        FINAL_WAV_PATH = f"{pathname}/{self.WAV_FILE_NAME}"

        if not os.path.exists(FINAL_WAV_PATH):
            # Download the audio-only stream
            audiostream = self.yt.streams.filter(only_audio=True).first()
            outfile_path = audiostream.download(pathname)

            # Convert the downloaded file to .wav, releasing the clip after use
            wav_file = AudioFileClip(outfile_path)
            try:
                wav_file.write_audiofile(FINAL_WAV_PATH, bitrate="16k", fps=16000)
            finally:
                wav_file.close()

        # Small files are returned without chunking.
        total_byte_size = os.path.getsize(FINAL_WAV_PATH)
        if total_byte_size < MAX_FILE_SIZE_BYTES:
            return FINAL_WAV_PATH

        # Load the .wav file only when it actually has to be split.
        audio = AudioSegment.from_wav(FINAL_WAV_PATH)

        # Estimate the wav size from its audio properties
        channels = audio.channels
        sample_width = audio.sample_width
        duration_in_sec = math.ceil(len(audio) / 1000)
        sample_rate = audio.frame_rate
        bit_rate = sample_width * 8
        wav_file_size = (sample_rate * bit_rate * channels * duration_in_sec) / 8

        # Chunk length (ms) chosen so each chunk is about MAX_FILE_SIZE_BYTES
        chunk_length_in_sec = math.ceil((duration_in_sec * MAX_FILE_SIZE_BYTES ) / wav_file_size)
        chunk_length_ms = chunk_length_in_sec * 1000
        chunks = make_chunks(audio, chunk_length_ms)

        # Export all of the individual chunks as wav files
        for i, chunk in enumerate(chunks):
            chunk_name = f"{self.YOUTUBE_VIDEO_ID}_{i}.wav"
            output_chunk_path = f"{pathname}/{chunk_name}"
            chunk.export(f"{output_chunk_path}", format="wav")

        return FINAL_WAV_PATH
113
+
114
+
115
class VideoTranscription:
    """Performs transcription on a link to a youtube video"""

    def __init__(self, datalink) -> None:
        self.datalink = datalink
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        self.model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
        openai.api_key = os.environ.get("OPENAI_API_KEY")

    def transcribe(self) -> dict:
        """Return the transcription of the youtube video.

        Returns:
            dict with "title", "text" and "segments" (one entry per
            sentence, holding "id", "text" and "tokens").

        Raises:
            ValueError: If ``datalink`` is not an http(s) link; this class
                has no handler for other sources.
        """
        if not self.datalink.startswith("http"):
            raise ValueError(f"VideoTranscription only supports http(s) links, got: {self.datalink!r}")

        start_time = time.time()
        transcript = self.get_text_from_link()
        end_time = time.time()
        print(f"transcription took {end_time - start_time} seconds")
        return transcript

    def get_text_from_link(self) -> dict:
        """Download the video's audio, transcribe it and split it into sentences."""

        # Folder where the audio of this video is stored
        YOUTUBE_VIDEO_ID = self.datalink.split("=")[1]
        FOLDER_NAME = f"./tests/{YOUTUBE_VIDEO_ID}"

        # Download the audio and get the path of the full-length wav file
        audio_file = DownloadAudio(self.datalink)
        original_file_name = audio_file.download(FOLDER_NAME)
        print(original_file_name)

        # Transcribe; "$" is escaped with a backslash in the stored text.
        chunk_segments, _ = self.model.transcribe(original_file_name, beam_size=5)
        text_transcriptions = "".join(
            chunk_segment.text.replace("$", "\\$") for chunk_segment in chunk_segments
        )

        # Tokenize each sentence of the transcription.
        sentences = tokenize.sent_tokenize(text_transcriptions)
        segments = [
            {
                "id": i,
                "text": sentence,
                "tokens": self.encoding.encode(sentence),
            }
            for i, sentence in enumerate(sentences)
        ]

        final_transcription = {
            "title": audio_file.get_yt_title(),
            "text": text_transcriptions,
            "segments": segments
        }

        return final_transcription
174
+
175
+
176
class AudioTranscription:
    """Performs transcription on an uploaded MP3 or WAV file"""

    def __init__(self, audio_file) -> None:
        """Keep the uploaded file and derive its working folder.

        Args:
            audio_file: File-like object with a ``name`` attribute.
        """
        self.file = audio_file
        self.title = self.file.name
        self.folder_name = self._folder_for(self.title)
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        self.model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
        openai.api_key = os.environ.get("OPENAI_API_KEY")

    @staticmethod
    def _folder_for(title:str) -> str:
        """Return ``./tests/<title without spaces and without its extension>``."""
        # splitext only looks at the file name, so a title without an
        # extension is left intact.
        return os.path.splitext(f"./tests/{title}".replace(' ', ''))[0]

    def get_redacted_name(self):
        """Return the folder used for this file's artefacts."""
        return self.folder_name

    def transcribe(self) -> dict:
        """Return the transcription of the uploaded audio.

        Returns:
            dict with "title", "text" and "segments" (one entry per
            sentence, holding "id", "text" and "tokens").

        Raises:
            ValueError: If the file name does not end in .wav or .mp3
                (case-insensitive).
        """
        start_time = time.time()

        # Validate the format first so nothing is written for bad input.
        file_type = Path(self.title).suffix.lower().lstrip('.')
        if file_type not in ('wav', 'mp3'):
            raise ValueError(f"Unsupported audio format: {self.title!r} (expected .wav or .mp3)")

        os.makedirs(self.folder_name, exist_ok=True)

        if file_type == 'wav':
            audio = pydub.AudioSegment.from_wav(self.file)
        else:
            audio = pydub.AudioSegment.from_mp3(self.file)

        # Keep a copy of the upload inside the working folder.
        save_path = Path(self.folder_name) / self.file.name
        audio.export(save_path, format=file_type)
        final_wav_path = str(save_path)

        if file_type == 'mp3':
            # The model is fed a .wav; export it from the decoded audio.
            final_wav_path = self.folder_name + "/" + Path(self.title).stem + '.wav'
            audio.export(final_wav_path, format="wav")

        # Transcribe; "$" is escaped with a backslash in the stored text.
        chunk_segments, info = self.model.transcribe(final_wav_path, beam_size=5)
        text_transcriptions = "".join(
            chunk_segment.text.replace("$", "\\$") for chunk_segment in chunk_segments
        )

        # Tokenize each sentence of the transcription.
        sentences = tokenize.sent_tokenize(text_transcriptions)
        segments = [
            {
                "id": i,
                "text": sentence,
                "tokens": self.encoding.encode(sentence),
            }
            for i, sentence in enumerate(sentences)
        ]

        final_transcription = {
            "title": self.title,
            "text": text_transcriptions,
            "segments": segments
        }
        end_time = time.time()
        print(f"transcription took {end_time - start_time} seconds")

        return final_transcription
239
+
240
def convert_pdf_to_txt_pages(path):
    """Extract the text of every page of a PDF.

    Args:
        path: Opened PDF file object, passed to ``PDFPage.get_pages``.

    Returns:
        tuple: ``(texts, nbPages)`` — the list of page texts in page order
        and the number of pages processed.
    """
    texts = []
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The converter appends every page to the same buffer; remember how much
    # was already consumed so each entry holds only the new page's text.
    consumed = 0
    try:
        for page in PDFPage.get_pages(path):
            interpreter.process_page(page)
            buffer_text = retstr.getvalue()
            texts.append(buffer_text[consumed:])
            consumed = len(buffer_text)
    finally:
        device.close()
        retstr.close()

    # One entry per page, so the page count needs no second pass over the file.
    nbPages = len(texts)
    return texts, nbPages
266
+
267
class PDFTranscription:
    """Extracts the text of an uploaded PDF and splits it into sentences."""

    def __init__(self, pdf_file):
        """Keep the uploaded file and derive its working folder.

        Args:
            pdf_file: File-like object with a ``name`` attribute.
        """
        self.file = pdf_file
        self.title = pdf_file.name
        self.folder_name = self._folder_for(self.title)
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    @staticmethod
    def _folder_for(title:str) -> str:
        """Return ``./tests/<title without spaces and without its extension>``."""
        # splitext only looks at the file name, so a title without an
        # extension is left intact.
        return os.path.splitext(f"./tests/{title}".replace(' ', ''))[0]

    def get_redacted_name(self):
        """Return the folder used for this file's artefacts."""
        return self.folder_name

    def transcribe(self):
        """Return the PDF text as a transcription dict.

        Returns:
            dict with "title", "text" and "segments" (one entry per
            sentence, holding "id", "text" and "tokens").
        """
        text, nbpages = convert_pdf_to_txt_pages(self.file)
        pdf_transcription = ''.join(text)

        sentences = tokenize.sent_tokenize(pdf_transcription)
        segments = [
            {
                "id": i,
                "text": sentence,
                "tokens": self.encoding.encode(sentence),
            }
            for i, sentence in enumerate(sentences)
        ]

        final_transcription = {
            "title":self.title,
            "text":pdf_transcription,
            "segments":segments
        }
        return final_transcription
300
+
301
+
302
+