Spaces:

retkowski
/

ytseg_demo

Running

App Files Files Community

retkowski committed on Aug 5, 2024

Commit

5402b60

1 Parent(s): 1c71e7c

Allow different modes: preloaded, document, YT video and custom text

Browse files

Files changed (2) hide show

app.py +79 -10
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import itertools
 import json
 import re
 from functools import partial
 from pathlib import Path
@@ -11,6 +12,25 @@ import streamlit as st
 from generate_text_api import SummarizerGenerator
 from model_inferences.utils.files import get_captions_from_vtt, get_transcript
 USE_PARAGRAPHING_MODEL = True
 def get_sublist_by_flattened_index(A, i):
@@ -105,11 +125,13 @@ class Toc:
         st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
         self._items.append(f"{space}* <a href='#{key}'>{text}</a>")
-endpoint = "http://hiaisc.isl.iar.kit.edu/summarize/summarize_stream"
-client = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/chapterize")
 if USE_PARAGRAPHING_MODEL:
-    paragrapher = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/paragraph")
 summarizer = SummarizerGenerator(endpoint)
 import re
@@ -177,25 +199,72 @@ if not hasattr(st, 'global_state'):
         transcripts_map["Machine Translation: " + lecture_id] = {"transcript": transcript, "video": video_path}
     st.global_state['KIT Lectures'] = transcripts_map
-type_of_document = st.selectbox('What kind of document do you want to test it on?', list(st.global_state.keys()))
-transcripts_map = st.global_state[type_of_document]
-selected_talk = st.selectbox("Choose a document...", list(transcripts_map.keys()))
-st.video(str(transcripts_map[selected_talk]['video']), format="video/mp4", start_time=0)
-input_text = st.text_area("Transcript", value=transcripts_map[selected_talk]['transcript'], height=300)
 toc = Toc()
 summarization_todos = []
 with st.expander("Adjust Thresholds"):
-    threshold = st.slider('Chapter Segmentation Threshold', 0.00, 1.00, value=0.4, step=0.05)
     paragraphing_threshold = st.slider('Paragraphing Threshold', 0.00, 1.00, value=0.5, step=0.05)
-if st.button("Process Transcript"):
     with st.sidebar:
         st.header("Table of Contents")
         toc.placeholder()

 import itertools
 import json
 import re
+from collections import defaultdict
 from functools import partial
 from pathlib import Path
 from generate_text_api import SummarizerGenerator
 from model_inferences.utils.files import get_captions_from_vtt, get_transcript
+def segmented_control(labels, key, default = None, max_size = 3) -> str:
+    """Group of buttons with the given labels. Return the selected label."""
+    if key not in st.session_state:
+        st.session_state[key] = default or labels[0]
+    selected_label = st.session_state[key]
+    def set_label(label: str) -> None:
+        st.session_state.update(**{key: label})
+    cols = st.columns([1] * len(labels))
+    for col, label in zip(cols, labels):
+        btn_type = "primary" if selected_label == label else "secondary"
+        col.button(label, on_click=set_label, args=(label,), use_container_width=True, type=btn_type)
+    return selected_label
 USE_PARAGRAPHING_MODEL = True
 def get_sublist_by_flattened_index(A, i):
         st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
         self._items.append(f"{space}* <a href='#{key}'>{text}</a>")
+import os
+endpoint = os.getenv('summarize_stream_url')
+client = OfflineTextSegmenterClient(os.getenv('chapterize_url'))
 if USE_PARAGRAPHING_MODEL:
+    paragrapher = OfflineTextSegmenterClient(os.getenv('paragraph_url'))
 summarizer = SummarizerGenerator(endpoint)
 import re
         transcripts_map["Machine Translation: " + lecture_id] = {"transcript": transcript, "video": video_path}
     st.global_state['KIT Lectures'] = transcripts_map
+#preloaded_document, youtube_video, custom_text = st.tabs(["Preloaded Document", "YouTube Video", "Custom Text"])
+selected = segmented_control(["Preloaded Document", "YouTube Video", "Custom Text"], default="Preloaded Document", key="tabs")
+input_text = ""
+transcripts_map = defaultdict(dict)
+if selected == "Preloaded Document":
+    print("Preloaded Document")
+    type_of_document = st.selectbox('What kind of document do you want to test it on?', list(st.global_state.keys()))
+    transcripts_map = st.global_state[type_of_document]
+    selected_talk = st.selectbox("Choose a document...", list(transcripts_map.keys()))
+    st.video(str(transcripts_map[selected_talk]['video']), format="video/mp4", start_time=0)
+    input_text = st.text_area("Transcript", value=transcripts_map[selected_talk]['transcript'], height=300)
+from youtube_transcript_api import NoTranscriptFound, TranscriptsDisabled, YouTubeTranscriptApi
+def get_transcript(video_id, lang="en"):
+  try:
+    transcripts = YouTubeTranscriptApi.list_transcripts(video_id)
+    transcript = transcripts.find_manually_created_transcript([lang]).fetch()
+  except NoTranscriptFound:
+    return transcripts.find_manually_created_transcript(["en", "en-US", "en-GB", "en-CA"]).fetch()
+  return transcript
+def get_title(video_url):
+    response = requests.get(f"https://noembed.com/embed?dataType=json&url={video_url}")
+    result = response.json()
+    return result["title"]
+if selected == "YouTube Video":
+    print("YouTube Video")
+    video_url = st.text_input("Enter YouTube Link", value="https://www.youtube.com/watch?v=YuIc4mq7zMU")
+    video_id = video_url.split("v=")[-1]
+    try:
+        subs = get_transcript(video_id)
+        selected_talk = get_title(video_url)
+    except (TranscriptsDisabled, NoTranscriptFound):
+        subs = None
+    if subs is not None:
+        st.video(video_url, format="video/mp4", start_time=0)
+        input_text = " ".join([sub["text"] for sub in subs])
+        input_text = re.sub(r'\n+', r' ', input_text).replace("  ", " ")
+        input_text = st.text_area("Transcript", value=input_text, height=300)
+    else:
+        st.error("No transcript found for this video.")
+if selected == "Custom Text":
+    print("Custom Text")
+    input_text = st.text_area("Transcript", height=300, placeholder="Insert your transcript here...")
+    input_text = re.sub(r'\n+', r' ', input_text)
+    selected_talk = "Your Transcript"
 toc = Toc()
 summarization_todos = []
 with st.expander("Adjust Thresholds"):
+    threshold = st.slider('Chapter Segmentation Threshold', 0.00, 1.00, value=0.5, step=0.05)
     paragraphing_threshold = st.slider('Paragraphing Threshold', 0.00, 1.00, value=0.5, step=0.05)
+if st.button("Process Transcript", disabled=not bool(input_text.strip())):
     with st.sidebar:
         st.header("Table of Contents")
         toc.placeholder()

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 requests
 pandas
 nltk
-webvtt-py

 requests
 pandas
 nltk
+webvtt-py
+youtube_transcript_api