import os import textwrap import numpy as np import openai from langchain.chains.summarize import load_summarize_chain from langchain.docstore.document import Document from langchain.llms import OpenAI from langchain.prompts import PromptTemplate from sklearn.cluster import KMeans from tenacity import stop_after_attempt # for exponential backoff from tenacity import retry, wait_random_exponential DEFAULT_PROMPT = ( "Summarize this Youtube video chapter. Always start with a topical sentence: " ) CHAPTER_TITLE = "Give a title to this video chapter based on the transcript: " title_template = "Give a title to this text summary: {text}" TITLE_PROMPT = PromptTemplate(template=title_template, input_variables=["text"]) @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6)) def get_embeddings(text_chunks, openai_api_key, model="text-embedding-ada-002"): data = openai.Embedding.create( input=text_chunks, model=model, openai_api_key=openai_api_key )["data"] embeddings = [item["embedding"] for item in data] return np.array(embeddings) def text_from_file(text_path): in_text = "" with open(text_path, "r", encoding="utf-8") as text_file: for line in text_file: in_text += line return in_text def get_chunks(timestamped_transcripts, chunk_lines): chunks = [] current_chunk = [] for line in timestamped_transcripts: current_chunk.append(line) if len(current_chunk) == chunk_lines: chunks.append("\n".join(current_chunk)) current_chunk = [] if len(current_chunk) > 0: chunks.append("\n".join(current_chunk)) return chunks def align_chapters(timestamped_transcript, yt_chapters): timestamped_transcripts = timestamped_transcript.strip().split("\n") chapters = [] chapter_text = "" chapter_start_time = 0.0 prev_end_time = 0.0 chapter_index = 0 for idx, trn in enumerate(timestamped_transcripts): trn_start_time = float(trn.split()[0]) trn_end_time = float(trn.split()[1]) trn_text = " ".join(trn.split()[2:]) if idx == 0: chapter_start_time = trn_start_time next_index = min(chapter_index + 1, len(yt_chapters) - 1) if trn_start_time >= yt_chapters[next_index]["start_time"]: if len(chapters) == len(yt_chapters): chapter_text += f"{trn_text}\n" else: chapters.append( { "text": chapter_text, "start_time": chapter_start_time, "end_time": prev_end_time, "title": yt_chapters[chapter_index]["title"], } ) chapter_text = trn_text chapter_start_time = trn_start_time chapter_index += 1 else: chapter_text += f"{trn_text}\n" prev_end_time = trn_end_time if len(chapters) == len(yt_chapters): chapter_index = len(yt_chapters) - 1 chapters[chapter_index]["text"] += chapter_text chapters[chapter_index]["end_time"] = prev_end_time return chapters def get_automatic_chapters( timestamped_transcript, openai_api_key, chunk_lines=5, num_clusters=3 ): timestamped_transcripts = [ timestamped_line for timestamped_line in timestamped_transcript.split("\n") if len(timestamped_line.strip()) > 0 ] # Split into chunks text_chunks = get_chunks(timestamped_transcripts, chunk_lines) embeddings = get_embeddings(text_chunks, openai_api_key) # Creating and fitting the K-means model kmeans = KMeans(n_clusters=num_clusters) kmeans.fit(embeddings) # Getting the cluster labels cluster_labels = kmeans.labels_ current_label = -1 current_text = "" chapters = [] for idx, (text_chunk, label) in enumerate(zip(text_chunks, cluster_labels)): start_time, end_time = get_chunk_timestamps(text_chunk) if idx == 0: chapter_start_time = start_time if label != current_label and current_label != -1: chapters.append( { "text": current_text, "start_time": chapter_start_time, "end_time": prev_end_time, "title": "", } ) current_text = "" chapter_start_time = start_time current_label = label current_text += get_chunk_text(text_chunk) prev_end_time = end_time if len(current_text) > 0: chapters.append( { "text": current_text, "start_time": chapter_start_time, "end_time": prev_end_time, "title": "", } ) return chapters def get_chunk_timestamps(chunk): start_time = float(chunk.strip().split("\n")[0].split()[0]) end_time = float(chunk.strip().split("\n")[-1].split()[1]) return start_time, end_time def get_chunk_text(chunk): chunk_text = "" for chunk_line in chunk.strip().split("\n"): chunk_text += " ".join(chunk_line.split()[2:]) return chunk_text def summarize_chapters(chapters, openai_api_key): llm = OpenAI(temperature=0.9, openai_api_key=openai_api_key) chapter_docs = [Document(page_content=chapter["text"]) for chapter in chapters] summary_chain = load_summarize_chain( llm, chain_type="map_reduce", return_intermediate_steps=True ) summaries = summary_chain( {"input_documents": chapter_docs}, return_only_outputs=True ) summary_docs = [ Document(page_content=summary) for summary in summaries["intermediate_steps"] ] title_chain = load_summarize_chain( llm, chain_type="map_reduce", return_intermediate_steps=True, map_prompt=TITLE_PROMPT, ) titles = title_chain({"input_documents": summary_docs}, return_only_outputs=True) summarized_chapters = [] for chapter, chapter_summary, chapter_title in zip( chapters, summaries["intermediate_steps"], titles["intermediate_steps"] ): if len(chapter["title"]) > 0: chapter_title = chapter["title"] summarized_chapters.append( { "start": chapter["start_time"], "end": chapter["end_time"], "text": chapter["text"], "title": chapter_title.strip(), "summary": chapter_summary.strip(), } ) return summarized_chapters, summaries["output_text"]