# youtube_summarizer/text_summary.py
import os
import textwrap
import numpy as np
import openai
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from sklearn.cluster import KMeans
from tenacity import retry, stop_after_attempt, wait_random_exponential  # for exponential backoff

DEFAULT_PROMPT = (
"Summarize this Youtube video chapter. Always start with a topical sentence: "
)
CHAPTER_TITLE = "Give a title to this video chapter based on the transcript: "
title_template = "Give a title to this text summary: {text}"
TITLE_PROMPT = PromptTemplate(template=title_template, input_variables=["text"])


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_embeddings(text_chunks, openai_api_key, model="text-embedding-ada-002"):
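    """Embed a list of text chunks with the OpenAI embeddings API.

    Returns a numpy array of shape (num_chunks, embedding_dim). The retry
    decorator backs off exponentially on transient API errors.
    """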
    # Pass the key per request via `api_key` (the kwarg the openai 0.x SDK recognizes);
    # `openai_api_key` is a LangChain-style name that openai.Embedding.create does not accept.
    data = openai.Embedding.create(
        input=text_chunks, model=model, api_key=openai_api_key
    )["data"]
embeddings = [item["embedding"] for item in data]
return np.array(embeddings)


def text_from_file(text_path):
    """Read a UTF-8 text file and return its entire contents as one string."""
    with open(text_path, "r", encoding="utf-8") as text_file:
        return text_file.read()


def get_chunks(timestamped_transcripts, chunk_lines):
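    """Group transcript lines into newline-joined chunks of at most chunk_lines lines."""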
chunks = []
current_chunk = []
for line in timestamped_transcripts:
current_chunk.append(line)
if len(current_chunk) == chunk_lines:
chunks.append("\n".join(current_chunk))
current_chunk = []
if len(current_chunk) > 0:
chunks.append("\n".join(current_chunk))
return chunks


def align_chapters(timestamped_transcript, yt_chapters):
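    """Split a timestamped transcript along the YouTube-provided chapter boundaries.

    Each transcript line is expected to look like "<start_seconds> <end_seconds> <text>";
    each entry in yt_chapters needs "start_time" and "title" keys.
    """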
timestamped_transcripts = timestamped_transcript.strip().split("\n")
chapters = []
chapter_text = ""
chapter_start_time = 0.0
prev_end_time = 0.0
chapter_index = 0
for idx, trn in enumerate(timestamped_transcripts):
trn_start_time = float(trn.split()[0])
trn_end_time = float(trn.split()[1])
trn_text = " ".join(trn.split()[2:])
if idx == 0:
chapter_start_time = trn_start_time
next_index = min(chapter_index + 1, len(yt_chapters) - 1)
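        # Crossing the next YouTube chapter's start time marks a chapter boundary.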
if trn_start_time >= yt_chapters[next_index]["start_time"]:
if len(chapters) == len(yt_chapters):
chapter_text += f"{trn_text}\n"
else:
chapters.append(
{
"text": chapter_text,
"start_time": chapter_start_time,
"end_time": prev_end_time,
"title": yt_chapters[chapter_index]["title"],
}
)
chapter_text = trn_text
chapter_start_time = trn_start_time
chapter_index += 1
else:
chapter_text += f"{trn_text}\n"
prev_end_time = trn_end_time
    # Flush the text accumulated for the final chapter. If the transcript never
    # crossed the last chapter boundary, that chapter has not been appended yet.
    if len(chapters) == len(yt_chapters):
        chapters[-1]["text"] += chapter_text
        chapters[-1]["end_time"] = prev_end_time
    elif chapter_text:
        chapters.append(
            {
                "text": chapter_text,
                "start_time": chapter_start_time,
                "end_time": prev_end_time,
                "title": yt_chapters[chapter_index]["title"],
            }
        )
    return chapters


def get_automatic_chapters(
timestamped_transcript, openai_api_key, chunk_lines=5, num_clusters=3
):
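    """Derive chapters automatically when the video has none.

    The transcript is split into fixed-size chunks, each chunk is embedded, and the
    embeddings are clustered with K-means; runs of chunks that share a cluster label
    become chapters (their titles are filled in later by summarize_chapters).
    """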
timestamped_transcripts = [
timestamped_line
for timestamped_line in timestamped_transcript.split("\n")
if len(timestamped_line.strip()) > 0
]
# Split into chunks
text_chunks = get_chunks(timestamped_transcripts, chunk_lines)
embeddings = get_embeddings(text_chunks, openai_api_key)
    # Create and fit the K-means model on the chunk embeddings. n_init is pinned for
    # stable behaviour across scikit-learn versions, and the cluster count is capped
    # by the number of chunks so short transcripts do not raise an error.
    kmeans = KMeans(n_clusters=min(num_clusters, len(text_chunks)), n_init=10)
    kmeans.fit(embeddings)
# Getting the cluster labels
cluster_labels = kmeans.labels_
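    # Consecutive chunks with the same cluster label are merged into one chapter;
    # a change of label marks a chapter boundary.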
current_label = -1
current_text = ""
chapters = []
for idx, (text_chunk, label) in enumerate(zip(text_chunks, cluster_labels)):
start_time, end_time = get_chunk_timestamps(text_chunk)
if idx == 0:
chapter_start_time = start_time
if label != current_label and current_label != -1:
chapters.append(
{
"text": current_text,
"start_time": chapter_start_time,
"end_time": prev_end_time,
"title": "",
}
)
current_text = ""
chapter_start_time = start_time
current_label = label
current_text += get_chunk_text(text_chunk)
prev_end_time = end_time
if len(current_text) > 0:
chapters.append(
{
"text": current_text,
"start_time": chapter_start_time,
"end_time": prev_end_time,
"title": "",
}
)
return chapters


def get_chunk_timestamps(chunk):
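    """Return the start time of a chunk's first line and the end time of its last line."""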
start_time = float(chunk.strip().split("\n")[0].split()[0])
end_time = float(chunk.strip().split("\n")[-1].split()[1])
return start_time, end_time


def get_chunk_text(chunk):
    """Drop the two timestamp fields from each chunk line and return only the words."""
    words = []
    for chunk_line in chunk.strip().split("\n"):
        words.extend(chunk_line.split()[2:])
    # The trailing space keeps consecutive chunks separated when callers concatenate them.
    return " ".join(words) + " "


def summarize_chapters(chapters, openai_api_key):
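    """Summarize and title each chapter with LangChain map-reduce chains.

    Chapters that already carry a YouTube title keep it; otherwise a title is
    generated from the chapter summary. Returns the per-chapter results and an
    overall summary of the whole video.
    """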
llm = OpenAI(temperature=0.9, openai_api_key=openai_api_key)
chapter_docs = [Document(page_content=chapter["text"]) for chapter in chapters]
summary_chain = load_summarize_chain(
llm, chain_type="map_reduce", return_intermediate_steps=True
)
summaries = summary_chain(
{"input_documents": chapter_docs}, return_only_outputs=True
)
summary_docs = [
Document(page_content=summary) for summary in summaries["intermediate_steps"]
]
title_chain = load_summarize_chain(
llm,
chain_type="map_reduce",
return_intermediate_steps=True,
map_prompt=TITLE_PROMPT,
)
titles = title_chain({"input_documents": summary_docs}, return_only_outputs=True)
summarized_chapters = []
for chapter, chapter_summary, chapter_title in zip(
chapters, summaries["intermediate_steps"], titles["intermediate_steps"]
):
if len(chapter["title"]) > 0:
chapter_title = chapter["title"]
summarized_chapters.append(
{
"start": chapter["start_time"],
"end": chapter["end_time"],
"text": chapter["text"],
"title": chapter_title.strip(),
"summary": chapter_summary.strip(),
}
)
return summarized_chapters, summaries["output_text"]
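

# Example usage sketch: it assumes the transcript file uses one line per segment in
# the form "<start_seconds> <end_seconds> <text>" (the format parsed above) and that
# an OPENAI_API_KEY environment variable is set; "transcript.txt" is a placeholder path.
if __name__ == "__main__":
    api_key = os.environ["OPENAI_API_KEY"]
    transcript = text_from_file("transcript.txt")
    # No YouTube chapters available here, so derive them by clustering chunk embeddings.
    auto_chapters = get_automatic_chapters(transcript, api_key)
    chapter_summaries, overall_summary = summarize_chapters(auto_chapters, api_key)
    for chapter in chapter_summaries:
        print(f'{chapter["start"]:.1f}-{chapter["end"]:.1f}  {chapter["title"]}')
        print(textwrap.fill(chapter["summary"], width=80))
    print("Overall summary:")
    print(textwrap.fill(overall_summary, width=80))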