"""Streamlit demo "Intelligent Recap".

The app lets the user pick a transcript (preloaded document, YouTube video or
custom text), sends it to a chapter-segmentation service and optionally a
paragraph-segmentation service, renders the chapters with a table of contents
and finally streams a summary for each chapter.

Service endpoints are read from the environment variables
``summarize_stream_url``, ``chapterize_url`` and ``paragraph_url``.
"""
import itertools
import json
import os
import re
from collections import defaultdict
from functools import partial
from pathlib import Path
from urllib.parse import parse_qs, urlparse

import pandas as pd
import requests
import streamlit as st
from youtube_transcript_api import NoTranscriptFound, TranscriptsDisabled, YouTubeTranscriptApi

from generate_text_api import SummarizerGenerator
from model_inferences.utils.files import get_captions_from_vtt, get_transcript

# The YouTube helper defined further down reuses the name ``get_transcript``.
# Keep a stable handle on the file-based loader so the two cannot be confused,
# regardless of the order in which definitions and calls are executed.
_read_transcript_file = get_transcript

USE_PARAGRAPHING_MODEL = True

# Chapters flushed inside the sentence loop are only summarized when their text
# is longer than this many characters.
MIN_SUMMARY_CHARS = 450

# Timeout (seconds) for the TED metadata lookups performed while loading data.
REQUEST_TIMEOUT_SECONDS = 30


def segmented_control(labels, key, default=None, max_size=3) -> str:
    """Group of buttons with the given labels. Return the selected label.

    The selection is stored in ``st.session_state[key]``; it is initialised to
    ``default`` (or the first label) on first use. ``max_size`` is accepted
    for interface compatibility but is not used.
    """
    if key not in st.session_state:
        st.session_state[key] = default or labels[0]

    selected_label = st.session_state[key]

    def set_label(label: str) -> None:
        st.session_state[key] = label

    cols = st.columns([1] * len(labels))
    for col, label in zip(cols, labels):
        btn_type = "primary" if selected_label == label else "secondary"
        col.button(label, on_click=set_label, args=(label,), use_container_width=True, type=btn_type)

    return selected_label


def get_sublist_by_flattened_index(A, i):
    """Return ``(sublist, sublist_index)`` for flattened element index ``i``.

    ``A`` is a list of lists; ``i`` indexes into the concatenation of all
    sublists. Returns ``(None, None)`` when ``i`` is out of range.
    """
    current_index = 0
    # enumerate() yields the true position; list.index() would return the first
    # *equal* sublist, which is wrong when two sublists have the same content.
    for position, sublist in enumerate(A):
        sublist_length = len(sublist)
        if current_index <= i < current_index + sublist_length:
            return sublist, position
        current_index += sublist_length
    return None, None


def get_talk_metadata(video_id):
    """Query the TED GraphQL endpoint for a talk's metadata.

    Returns the decoded JSON response on HTTP 200. On any other status code or
    on a connection error/timeout the problem is printed and ``None`` is
    returned, so callers must handle a missing result.
    """
    url = "https://www.ted.com/graphql"
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "x-operation-name": "Transcript",
    }
    data = {
        "query": """
            query GetTalk($videoId: ID!) {
                video(id: $videoId) {
                    title,
                    presenterDisplayName,
                    nativeDownloads {medium}
                }
            }
        """,
        "variables": {
            "videoId": video_id,
        },
    }

    try:
        response = requests.post(url, json=data, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f"Error: {exc}")
        return None

    if response.status_code == 200:
        return response.json()

    print(f"Error: {response.status_code}, {response.text}")
    return None


class OfflineTextSegmenterClient:
    """Minimal HTTP client for a text segmentation service (``<host>/segment``)."""

    def __init__(self, host_url):
        if not host_url:
            # Typically an unset environment variable; fail with a clear message
            # instead of an AttributeError on ``None``.
            raise ValueError("OfflineTextSegmenterClient requires a non-empty host URL")
        self.host_url = host_url.rstrip("/") + "/segment"

    def segment(self, text, captions=None, generate_titles=False, threshold=0.4):
        """Segment ``text`` and return a dict with ``segments``, ``titles`` and ``sentences``.

        Raises ``requests.HTTPError`` when the service answers with an error
        status, and ``KeyError`` when an expected field is missing.
        """
        payload = {
            'text': text,
            'captions': captions,
            'generate_titles': generate_titles,
            "prefix_titles": True,
            "threshold": threshold,
        }
        headers = {
            'Content-Type': 'application/json'
        }

        http_response = requests.post(self.host_url, data=json.dumps(payload), headers=headers)
        http_response.raise_for_status()
        response = http_response.json()
        return {
            'segments': response["segments"],
            'titles': response["titles"],
            'sentences': response["sentences"],
        }


class Toc:
    """Collects headings while rendering them and emits a linked table of contents."""

    def __init__(self):
        self._items = []
        self._placeholder = None

    def title(self, text):
        """Render ``text`` as an ``h1`` and add it to the table of contents."""
        self._markdown(text, "h1")

    def header(self, text):
        """Render ``text`` as an ``h2`` and add it as an indented entry."""
        self._markdown(text, "h2", " " * 2)

    def subheader(self, text):
        """Render ``text`` as an ``h3`` and add it as a further indented entry."""
        self._markdown(text, "h3", " " * 4)

    def placeholder(self, sidebar=False):
        """Reserve the spot (page or sidebar) where the table of contents will appear."""
        self._placeholder = st.sidebar.empty() if sidebar else st.empty()

    def generate(self):
        """Fill the reserved placeholder with the collected entries, if one was reserved."""
        if self._placeholder:
            self._placeholder.markdown("\n".join(self._items), unsafe_allow_html=True)

    def _markdown(self, text, level, space=""):
        # Build an anchor id from the heading text: spaces/apostrophes become
        # dashes, everything except word characters and dashes is dropped.
        key = re.sub(r'[^\w-]', '', text.replace(" ", "-").replace("'", "-").lower())

        st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
        self._items.append(f"{space}* <a href='#{key}'>{text}</a>")


endpoint = os.getenv('summarize_stream_url')

client = OfflineTextSegmenterClient(os.getenv('chapterize_url'))
if USE_PARAGRAPHING_MODEL:
    paragrapher = OfflineTextSegmenterClient(os.getenv('paragraph_url'))
summarizer = SummarizerGenerator(endpoint)


def replace_newlines(text):
    """Replace every run of newlines in ``text`` with exactly one blank line."""
    updated_text = re.sub(r'\n+', r'\n\n', text)
    return updated_text


def generate_summary(summarizer, generated_text_box, input_, prefix=""):
    """Stream a summary of ``input_`` into ``generated_text_box``.

    The box is refreshed after every streamed chunk. Returns the complete
    summary (including ``prefix``) with surrounding whitespace stripped.
    """
    all_generated_text = prefix
    for generated_text in summarizer.generate_summary_stream(input_):
        all_generated_text += replace_newlines(generated_text)
        generated_text_box.info(all_generated_text)
    print(all_generated_text)
    return all_generated_text.strip()


def get_lecture_id(path):
    """Return the numeric lecture id encoded in the parent directory name of ``path``.

    The parent directory name is split on ``-`` and its second field is
    converted to ``int``.
    """
    return int(path.parts[-2].split('-')[1])


def _load_nips_talks():
    """Load up to 15 NIPS 2021 talks (transcript, captions, video path) keyed by title."""
    transcript_files = itertools.islice(
        Path("demo_data/nips-2021/").rglob("transcript_whisper_large-v2.vtt"), 15
    )
    talks = {}
    for transcript_file in transcript_files:
        base_path = transcript_file.parent
        # Titles come from the metadata.json stored next to each transcript.
        with open(base_path / "metadata.json", encoding="utf-8") as f:
            title = json.load(f)["title"]
        talks[title] = {
            "transcript": _read_transcript_file(base_path / "transcript_whisper_large-v2.txt"),
            "captions": get_captions_from_vtt(transcript_file),
            "video": base_path / "video.mp4",
        }
    return talks


def _load_ted_talks():
    """Load TED talks keyed by title; talks without metadata or download URL are skipped."""
    data = pd.read_json("demo_data/ted_talks.json")
    talks = {}
    for video_id, text_parts in zip(data.talk_id.tolist(), data.text.tolist()):
        transcript = " ".join(text_parts)
        metadata = get_talk_metadata(video_id)
        # The lookup returns None on failure, and the response may carry a null
        # "data" or "video" entry; skip such talks instead of crashing.
        video = ((metadata or {}).get("data") or {}).get("video")
        if not video:
            continue
        print(metadata["data"])
        if video["nativeDownloads"] is None:
            continue
        talks[video["title"]] = {
            "transcript": transcript,
            "video": video["nativeDownloads"]["medium"],
            "presenter": video["presenterDisplayName"],
        }
    return talks


def _load_kit_lectures():
    """Load the KIT lecture transcripts, ordered by lecture id."""
    sorted_path_list = sorted(Path("demo_data/lectures/").rglob("English.vtt"), key=get_lecture_id)
    lectures = {}
    for transcript_file in sorted_path_list:
        base_path = transcript_file.parent
        lecture_id = base_path.parts[-1]
        transcript = " ".join(
            [c["text"].strip() for c in get_captions_from_vtt(transcript_file)]
        ).replace("\n", " ")
        lectures["Machine Translation: " + lecture_id] = {
            "transcript": transcript,
            "video": Path(base_path, "video.mp4"),
        }
    return lectures


def _extract_video_id(video_url):
    """Extract the YouTube video id from ``video_url``.

    Handles ``...watch?v=ID&other=params`` and ``youtu.be/ID`` links; anything
    else falls back to taking whatever follows the last ``v=``.
    """
    parsed = urlparse(video_url)
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]
    if parsed.netloc.endswith("youtu.be"):
        return parsed.path.lstrip("/")
    return video_url.split("v=")[-1]


def _render_chapter(toc, todos, title, text, summarize=True):
    """Render one chapter and, if requested, queue its summary.

    Adds ``title`` to ``toc``, optionally creates the info box the summary will
    later be streamed into (appending the deferred call to ``todos``), and
    writes the chapter ``text`` below it.
    """
    toc.subheader(title)
    if summarize:
        generated_text_box = st.info("")
        todos.append(partial(generate_summary, summarizer, generated_text_box, text))
    st.write(text)


st.header("Demo: Intelligent Recap")

# Streamlit re-executes this script on every interaction; the datasets are kept
# as an attribute on the ``st`` module so they are only loaded once per process.
if not hasattr(st, 'global_state'):
    # Build everything first and publish it with a single assignment, so that a
    # failure while loading cannot leave a half-initialised cache behind.
    st.global_state = {
        'NIPS 2021 Talks': _load_nips_talks(),
        'TED Talks': _load_ted_talks(),
        'KIT Lectures': _load_kit_lectures(),
    }

selected = segmented_control(
    ["Preloaded Document", "YouTube Video", "Custom Text"],
    default="Preloaded Document",
    key="tabs",
)

input_text = ""
selected_talk = None
# For YouTube / custom text there are no stored entries (and hence no captions).
transcripts_map = defaultdict(dict)

if selected == "Preloaded Document":
    print("Preloaded Document")
    type_of_document = st.selectbox('What kind of document do you want to test it on?', list(st.global_state.keys()))
    transcripts_map = st.global_state[type_of_document]

    selected_talk = st.selectbox("Choose a document...", list(transcripts_map.keys()))
    if selected_talk is None:
        # Empty collection: nothing to show, and the process button stays disabled.
        st.warning("No documents are available for this collection.")
    else:
        st.video(str(transcripts_map[selected_talk]['video']), format="video/mp4", start_time=0)
        input_text = st.text_area("Transcript", value=transcripts_map[selected_talk]['transcript'], height=300)


def get_transcript(video_id, lang="en"):
    """Fetch the manually created YouTube transcript of ``video_id``.

    Tries ``lang`` first and falls back to common English variants. Raises
    ``NoTranscriptFound`` if neither exists; errors from listing the
    transcripts (e.g. ``TranscriptsDisabled``) propagate unchanged.

    NOTE: from this point on, this definition shadows the file-based
    ``get_transcript`` imported at the top of the module.
    """
    # Listing happens outside the try block so that ``transcripts`` is always
    # bound when the fallback in the except branch runs.
    transcripts = YouTubeTranscriptApi.list_transcripts(video_id)
    try:
        return transcripts.find_manually_created_transcript([lang]).fetch()
    except NoTranscriptFound:
        return transcripts.find_manually_created_transcript(["en", "en-US", "en-GB", "en-CA"]).fetch()


def get_title(video_url):
    """Return the title of the video at ``video_url`` as reported by noembed.com.

    Raises ``KeyError`` when the response carries no ``title`` field.
    """
    # Pass the URL via ``params`` so it is percent-encoded; interpolating it
    # into the query string breaks for video URLs that contain '&'.
    response = requests.get(
        "https://noembed.com/embed",
        params={"dataType": "json", "url": video_url},
    )
    result = response.json()
    return result["title"]


if selected == "YouTube Video":
    print("YouTube Video")
    video_url = st.text_input("Enter YouTube Link", value="https://www.youtube.com/watch?v=YuIc4mq7zMU")
    video_id = _extract_video_id(video_url)
    try:
        subs = get_transcript(video_id)
        selected_talk = get_title(video_url)
    except (TranscriptsDisabled, NoTranscriptFound):
        subs = None
    if subs is not None:
        st.video(video_url, format="video/mp4", start_time=0)
        input_text = " ".join([sub["text"] for sub in subs])
        # NOTE(review): as written, the trailing replace() maps a space to a
        # space and is a no-op; it presumably lost a double or non-breaking
        # space somewhere along the way. Confirm the intended character.
        input_text = re.sub(r'\n+', r' ', input_text).replace(" ", " ")
        input_text = st.text_area("Transcript", value=input_text, height=300)
    else:
        st.error("No transcript found for this video.")

if selected == "Custom Text":
    print("Custom Text")
    input_text = st.text_area("Transcript", height=300, placeholder="Insert your transcript here...")
    input_text = re.sub(r'\n+', r' ', input_text)
    selected_talk = "Your Transcript"

toc = Toc()
# Summaries are generated only after the whole page has been laid out, so the
# chapters appear immediately and the summaries stream in afterwards.
summarization_todos = []

with st.expander("Adjust Thresholds"):
    threshold = st.slider('Chapter Segmentation Threshold', 0.00, 1.00, value=0.5, step=0.05)
    paragraphing_threshold = st.slider('Paragraphing Threshold', 0.00, 1.00, value=0.5, step=0.05)

if st.button("Process Transcript", disabled=not bool(input_text.strip())):
    with st.sidebar:
        st.header("Table of Contents")
        toc.placeholder()

    st.header(selected_talk, divider='rainbow')
    # NOTE: the presenter stored for some preloaded talks is currently not rendered.

    # Only some preloaded documents carry captions; everything else sends None.
    captions = transcripts_map.get(selected_talk, {}).get('captions')

    result = client.segment(input_text, captions, generate_titles=True, threshold=threshold)
    if USE_PARAGRAPHING_MODEL:
        presult = paragrapher.segment(input_text, captions, generate_titles=False, threshold=paragraphing_threshold)
        paragraphs = presult['segments']
    segments, titles, sentences = result['segments'], result['titles'], result['sentences']

    if USE_PARAGRAPHING_MODEL:
        prev_chapter_idx = 0
        prev_paragraph_idx = 0
        segment = []
        for i, sentence in enumerate(sentences):
            _, chapter_idx = get_sublist_by_flattened_index(segments, i)
            _, paragraph_idx = get_sublist_by_flattened_index(paragraphs, i)

            if chapter_idx != prev_chapter_idx:
                # A chapter boundary (with or without a simultaneous paragraph
                # boundary): flush the sentences collected so far.
                print("Chapter / Chapter & Paragraph")
                segment_text = " ".join(segment)
                _render_chapter(
                    toc,
                    summarization_todos,
                    titles[prev_chapter_idx],
                    segment_text,
                    summarize=len(segment_text) > MIN_SUMMARY_CHARS,
                )
                segment = []
            elif paragraph_idx != prev_paragraph_idx:
                # Paragraph boundary inside the current chapter.
                print("Paragraph")
                segment.append("\n\n")

            segment.append(sentence)

            prev_chapter_idx = chapter_idx
            prev_paragraph_idx = paragraph_idx

        # Flush the last chapter.
        # NOTE(review): unlike the chapters flushed inside the loop, the last
        # one is summarized regardless of its length; confirm this is intended.
        segment_text = " ".join(segment)
        _render_chapter(toc, summarization_todos, titles[prev_chapter_idx], segment_text)
    else:
        segments = [" ".join(segment) for segment in segments]
        for title, segment in zip(titles, segments):
            _render_chapter(toc, summarization_todos, title, segment)
    toc.generate()

for summarization_todo in summarization_todos:
    summarization_todo()