Keane Moraes committed
Commit f76e4eb · 1 Parent(s): 625fc77

threading changes

app.py CHANGED
@@ -14,6 +14,7 @@ import whisper
 import os, json
 import math
 import re
+from threading import Thread
 
 # Custom classes
 from transcription import *
@@ -41,10 +42,12 @@ data_transcription = {"title":"", "text":""}
 embeddings = []
 text_chunks_lib = dict()
 user_input = None
+title_entry = None
 
 tldr = ""
 summary = ""
 takeaways = []
+keywords = []
 
 folder_name = "./tests"
 input_accepted = False
@@ -61,6 +64,47 @@ st.write('It provides a summary, transcription, key insights, a mind map and a Q
 
 bar = st.progress(0)
 
+def generate_word_embeddings():
+    if not os.path.exists(f"{folder_name}/word_embeddings.csv"):
+        for i, segment in enumerate(segments):
+            bar.progress(max(math.ceil((i/len(segments) * 50)), 1))
+            response = openai.Embedding.create(
+                input= segment["text"].strip(),
+                model="text-embedding-ada-002"
+            )
+            embeddings = response['data'][0]['embedding']
+            meta = {
+                "text": segment["text"].strip(),
+                "embedding": embeddings
+            }
+            data.append(meta)
+
+        pd.DataFrame(data).to_csv(f'{folder_name}/word_embeddings.csv')
+    else:
+        data = pd.read_csv(f'{folder_name}/word_embeddings.csv')
+
+
+def generate_text_chunks_lib():
+    text_df = pd.DataFrame.from_dict({"title": [data_transcription["title"]], "text":[data_transcription["text"]]})
+    input_accepted = True
+
+    # For each body of text, create text chunks of a certain token size required for the transformer
+    title_entry = text_df['title'][0]
+    print(title_entry)
+    for i in range(0, len(text_df)):
+        nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)
+        # For each chunk of sentences (within the token max)
+        text_chunks = []
+        for n in range(0, len(nested_sentences)):
+            tc = " ".join(map(str, nested_sentences[n]))
+            text_chunks.append(tc)
+
+    text_chunks_lib[title_entry] = text_chunks
+
+    # Generate key takeaways
+    key_engine = Keywords(title_entry)
+    keywords = key_engine.get_keywords(text_chunks_lib)
+
 # =========== SIDEBAR FOR GENERATION ===========
 with st.sidebar:
     youtube_link = st.text_input(label = "Type in your Youtube link", placeholder = "", key="url")
@@ -81,7 +125,7 @@ with st.sidebar:
 
     if st.button("Start Analysis"):
 
-        #
+        # Youtube Transcription
         if re.search(REGEXP_YOUTUBE_URL, youtube_link):
             vte = VideoTranscription(youtube_link)
             YOUTUBE_VIDEO_ID = youtube_link.split("=")[1]
@@ -89,13 +133,10 @@ with st.sidebar:
            if not os.path.exists(folder_name):
                os.mkdir(folder_name)
 
-            with st.spinner('Running transcription...'):
+            with st.spinner('Running transcription...'):
                data_transcription = vte.transcribe()
                segments = data_transcription['segments']
-
-            with open(f"{folder_name}/data.json", "w") as f:
-                json.dump(data_transcription, f, indent=4)
-
+
         # PDF Transcription
         elif pdf_file is not None:
             pte = PDFTranscription(pdf_file)
@@ -103,7 +144,7 @@ with st.sidebar:
            if not os.path.exists(folder_name):
                os.mkdir(folder_name)
 
-            with st.spinner('Running transcription...'):
+            with st.spinner('Running transcription...'):
                data_transcription = pte.transcribe()
                segments = data_transcription['segments']
 
@@ -114,7 +155,7 @@ with st.sidebar:
            if not os.path.exists(f""):
                os.mkdir(folder_name)
 
-            with st.spinner('Running transcription...'):
+            with st.spinner('Running transcription...'):
                data_transcription = ate.transcribe()
                segments = data_transcription['segments']
 
@@ -124,49 +165,18 @@ with st.sidebar:
         else:
             st.error("Please type in your youtube link or upload the PDF")
             st.experimental_rerun()
-
-        # Generate embeddings
-        if not os.path.exists(f"{folder_name}/word_embeddings.csv"):
-            for i, segment in enumerate(segments):
-                bar.progress(max(math.ceil((i/len(segments) * 50)), 1))
-                response = openai.Embedding.create(
-                    input= segment["text"].strip(),
-                    model="text-embedding-ada-002"
-                )
-                embeddings = response['data'][0]['embedding']
-                meta = {
-                    "text": segment["text"].strip(),
-                    "embedding": embeddings
-                }
-                data.append(meta)
-
-            pd.DataFrame(data).to_csv(f'{folder_name}/word_embeddings.csv')
-        else:
-            data = pd.read_csv(f'{folder_name}/word_embeddings.csv')
-            embeddings = data["embedding"]
 
-        bar.progress(75)
-
-        text_df = pd.DataFrame.from_dict({"title": [data_transcription["title"]], "text":[data_transcription["text"]]})
-        input_accepted = True
 
-
-        # For each body of text, create text chunks of a certain token size required for the transformer
-        title_entry = text_df['title'][0]
-        print(title_entry)
-        for i in range(0, len(text_df)):
-            nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)
-            # For each chunk of sentences (within the token max)
-            text_chunks = []
-            for n in range(0, len(nested_sentences)):
-                tc = " ".join(map(str, nested_sentences[n]))
-                text_chunks.append(tc)
-
-        text_chunks_lib[title_entry] = text_chunks
-
-        # Generate key takeaways
-        key_engine = Keywords(title_entry)
-        keywords = key_engine.get_keywords(text_chunks_lib)
+        # Generate embeddings
+        thread1 = Thread(target=generate_word_embeddings)
+        thread1.start()
+        # Generate text chunks
+        thread2 = Thread(target=generate_text_chunks_lib)
+        thread2.start()
+
+        # Wait for them to complete
+        thread1.join()
+        thread2.join()
 
         # Generate the summary
         if gen_summary == 'Yes':