ScientiaEtVeritas committed on
Commit
5402b60
·
1 Parent(s): 1c71e7c

Allow different modes: preloaded, document, YT video and custom text

Browse files
Files changed (2) hide show
  1. app.py +79 -10
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import itertools
2
  import json
3
  import re
 
4
  from functools import partial
5
  from pathlib import Path
6
 
@@ -11,6 +12,25 @@ import streamlit as st
11
  from generate_text_api import SummarizerGenerator
12
  from model_inferences.utils.files import get_captions_from_vtt, get_transcript
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  USE_PARAGRAPHING_MODEL = True
15
 
16
  def get_sublist_by_flattened_index(A, i):
@@ -105,11 +125,13 @@ class Toc:
105
  st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
106
  self._items.append(f"{space}* <a href='#{key}'>{text}</a>")
107
 
108
- endpoint = "http://hiaisc.isl.iar.kit.edu/summarize/summarize_stream"
109
 
110
- client = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/chapterize")
 
 
111
  if USE_PARAGRAPHING_MODEL:
112
- paragrapher = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/paragraph")
113
  summarizer = SummarizerGenerator(endpoint)
114
 
115
  import re
@@ -177,25 +199,72 @@ if not hasattr(st, 'global_state'):
177
  transcripts_map["Machine Translation: " + lecture_id] = {"transcript": transcript, "video": video_path}
178
  st.global_state['KIT Lectures'] = transcripts_map
179
 
180
- type_of_document = st.selectbox('What kind of document do you want to test it on?', list(st.global_state.keys()))
 
 
 
 
181
 
182
- transcripts_map = st.global_state[type_of_document]
 
 
183
 
184
- selected_talk = st.selectbox("Choose a document...", list(transcripts_map.keys()))
185
 
186
- st.video(str(transcripts_map[selected_talk]['video']), format="video/mp4", start_time=0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
- input_text = st.text_area("Transcript", value=transcripts_map[selected_talk]['transcript'], height=300)
 
 
 
 
189
 
190
  toc = Toc()
191
 
192
  summarization_todos = []
193
 
194
  with st.expander("Adjust Thresholds"):
195
- threshold = st.slider('Chapter Segmentation Threshold', 0.00, 1.00, value=0.4, step=0.05)
196
  paragraphing_threshold = st.slider('Paragraphing Threshold', 0.00, 1.00, value=0.5, step=0.05)
197
 
198
- if st.button("Process Transcript"):
199
  with st.sidebar:
200
  st.header("Table of Contents")
201
  toc.placeholder()
 
1
  import itertools
2
  import json
3
  import re
4
+ from collections import defaultdict
5
  from functools import partial
6
  from pathlib import Path
7
 
 
12
  from generate_text_api import SummarizerGenerator
13
  from model_inferences.utils.files import get_captions_from_vtt, get_transcript
14
 
15
+
16
def segmented_control(labels, key, default=None, max_size=3) -> str:
    """Render a row of buttons, one per label, and return the active label.

    The active label is stored in ``st.session_state[key]``. On first use it
    is initialised to ``default``, or to the first label when ``default`` is
    falsy. Clicking a button stores that button's label through an
    ``on_click`` callback; the value returned here is the label that was
    stored when this call started.

    ``max_size`` is accepted but not used by this function.
    """
    if key not in st.session_state:
        st.session_state[key] = default if default else labels[0]

    active = st.session_state[key]

    def _select(choice: str) -> None:
        # Invoked by the button's on_click hook with the clicked label.
        st.session_state[key] = choice

    # Equal-width columns, one per label.
    columns = st.columns([1] * len(labels))

    for column, choice in zip(columns, labels):
        # Only the currently active label is drawn as "primary".
        column.button(
            choice,
            on_click=_select,
            args=(choice,),
            use_container_width=True,
            type="primary" if choice == active else "secondary",
        )

    return active
33
+
34
  USE_PARAGRAPHING_MODEL = True
35
 
36
  def get_sublist_by_flattened_index(A, i):
 
125
  st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
126
  self._items.append(f"{space}* <a href='#{key}'>{text}</a>")
127
 
128
+ import os
129
 
130
+ endpoint = os.getenv('summarize_stream_url')
131
+
132
+ client = OfflineTextSegmenterClient(os.getenv('chapterize_url'))
133
  if USE_PARAGRAPHING_MODEL:
134
+ paragrapher = OfflineTextSegmenterClient(os.getenv('paragraph_url'))
135
  summarizer = SummarizerGenerator(endpoint)
136
 
137
  import re
 
199
  transcripts_map["Machine Translation: " + lecture_id] = {"transcript": transcript, "video": video_path}
200
  st.global_state['KIT Lectures'] = transcripts_map
201
 
202
+ #preloaded_document, youtube_video, custom_text = st.tabs(["Preloaded Document", "YouTube Video", "Custom Text"])
203
+ selected = segmented_control(["Preloaded Document", "YouTube Video", "Custom Text"], default="Preloaded Document", key="tabs")
204
+
205
+ input_text = ""
206
+ transcripts_map = defaultdict(dict)
207
 
208
+ if selected == "Preloaded Document":
209
+ print("Preloaded Document")
210
+ type_of_document = st.selectbox('What kind of document do you want to test it on?', list(st.global_state.keys()))
211
 
212
+ transcripts_map = st.global_state[type_of_document]
213
 
214
+ selected_talk = st.selectbox("Choose a document...", list(transcripts_map.keys()))
215
+
216
+ st.video(str(transcripts_map[selected_talk]['video']), format="video/mp4", start_time=0)
217
+
218
+ input_text = st.text_area("Transcript", value=transcripts_map[selected_talk]['transcript'], height=300)
219
+
220
+ from youtube_transcript_api import NoTranscriptFound, TranscriptsDisabled, YouTubeTranscriptApi
221
+
222
+
223
def get_transcript(video_id, lang="en"):
    """Fetch a manually created transcript for a YouTube video.

    Languages are tried in priority order: ``lang`` first, then the English
    variants ``en``, ``en-US``, ``en-GB`` and ``en-CA``.

    Args:
        video_id: Id of the YouTube video.
        lang: Preferred language code.

    Returns:
        The result of ``fetch()`` on the first matching transcript
        (the caller reads ``sub["text"]`` from each entry).

    Raises:
        NoTranscriptFound: If no manually created transcript exists in any of
            the candidate languages.
        Any exception raised by ``YouTubeTranscriptApi.list_transcripts``
        (the caller handles ``TranscriptsDisabled``) propagates unchanged.
    """
    # NOTE(review): this definition shadows the `get_transcript` imported from
    # model_inferences.utils.files at the top of the file -- confirm that no
    # later code still expects the imported helper.
    candidates = [lang, "en", "en-US", "en-GB", "en-CA"]

    # Listing happens outside any handler so a failure here can never leave
    # `transcripts` unbound inside an `except` branch.
    transcripts = YouTubeTranscriptApi.list_transcripts(video_id)

    # The lookup walks the language codes in order and raises
    # NoTranscriptFound only when none of them matches.
    return transcripts.find_manually_created_transcript(candidates).fetch()
230
+
231
def get_title(video_url):
    """Look up the title of a video through the noembed.com endpoint.

    Args:
        video_url: URL of the video, passed to noembed as the ``url`` query
            parameter.

    Returns:
        The ``"title"`` field of the JSON response.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
        KeyError: If the JSON response has no ``"title"`` field.
    """
    response = requests.get(
        "https://noembed.com/embed",
        # Passing the URL via `params` percent-encodes it, so characters such
        # as `&` or `#` in the video URL cannot corrupt the query string.
        params={"dataType": "json", "url": video_url},
        # Without a timeout a stalled connection would block the app forever.
        timeout=10,
    )
    response.raise_for_status()
    return response.json()["title"]
235
+
236
+ if selected == "YouTube Video":
237
+ print("YouTube Video")
238
+ video_url = st.text_input("Enter YouTube Link", value="https://www.youtube.com/watch?v=YuIc4mq7zMU")
239
+ video_id = video_url.split("v=")[-1]
240
+ try:
241
+ subs = get_transcript(video_id)
242
+ selected_talk = get_title(video_url)
243
+ except (TranscriptsDisabled, NoTranscriptFound):
244
+ subs = None
245
+ if subs is not None:
246
+ st.video(video_url, format="video/mp4", start_time=0)
247
+ input_text = " ".join([sub["text"] for sub in subs])
248
+ input_text = re.sub(r'\n+', r' ', input_text).replace(" ", " ")
249
+ input_text = st.text_area("Transcript", value=input_text, height=300)
250
+ else:
251
+ st.error("No transcript found for this video.")
252
 
253
+ if selected == "Custom Text":
254
+ print("Custom Text")
255
+ input_text = st.text_area("Transcript", height=300, placeholder="Insert your transcript here...")
256
+ input_text = re.sub(r'\n+', r' ', input_text)
257
+ selected_talk = "Your Transcript"
258
 
259
  toc = Toc()
260
 
261
  summarization_todos = []
262
 
263
  with st.expander("Adjust Thresholds"):
264
+ threshold = st.slider('Chapter Segmentation Threshold', 0.00, 1.00, value=0.5, step=0.05)
265
  paragraphing_threshold = st.slider('Paragraphing Threshold', 0.00, 1.00, value=0.5, step=0.05)
266
 
267
+ if st.button("Process Transcript", disabled=not bool(input_text.strip())):
268
  with st.sidebar:
269
  st.header("Table of Contents")
270
  toc.placeholder()
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  requests
2
  pandas
3
  nltk
4
- webvtt-py
 
 
1
  requests
2
  pandas
3
  nltk
4
+ webvtt-py
5
+ youtube_transcript_api