# youtube_summarizer/text_summary.py
import os
import textwrap
import numpy as np
import openai
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from sklearn.cluster import KMeans
from tenacity import retry, stop_after_attempt, wait_random_exponential  # for exponential backoff

DEFAULT_PROMPT = (
"Summarize this Youtube video chapter. Always start with a topical sentence: "
)
CHAPTER_TITLE = "Give a title to this video chapter based on the transcript: "
title_template = "Give a title to this text summary: {text}"
TITLE_PROMPT = PromptTemplate(template=title_template, input_variables=["text"])


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_embeddings(text_chunks, openai_api_key, model="text-embedding-ada-002"):
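    """Embed a list of text chunks with the OpenAI embeddings API.

    Returns a numpy array of shape (num_chunks, embedding_dim). The retry
    decorator backs off exponentially on transient API errors.
    """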
    # Pass the key per request via `api_key` (the kwarg the openai 0.x SDK recognizes);
    # `openai_api_key` is a LangChain-style name that openai.Embedding.create does not accept.
    data = openai.Embedding.create(
        input=text_chunks, model=model, api_key=openai_api_key
    )["data"]
embeddings = [item["embedding"] for item in data]
return np.array(embeddings)


def text_from_file(text_path):
    """Read a UTF-8 text file and return its entire contents as one string."""
    with open(text_path, "r", encoding="utf-8") as text_file:
        return text_file.read()


def get_chunks(timestamped_transcripts, chunk_lines):
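    """Group transcript lines into newline-joined chunks of at most chunk_lines lines."""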
chunks = []
current_chunk = []
for line in timestamped_transcripts:
current_chunk.append(line)
if len(current_chunk) == chunk_lines:
chunks.append("\n".join(current_chunk))
current_chunk = []
if len(current_chunk) > 0:
chunks.append("\n".join(current_chunk))
return chunks


def align_chapters(timestamped_transcript, yt_chapters):
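    """Split a timestamped transcript along the YouTube-provided chapter boundaries.

    Each transcript line is expected to look like "<start_seconds> <end_seconds> <text>";
    each entry in yt_chapters needs "start_time" and "title" keys.
    """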
timestamped_transcripts = timestamped_transcript.strip().split("\n")
chapters = []
chapter_text = ""
chapter_start_time = 0.0
prev_end_time = 0.0
chapter_index = 0
for idx, trn in enumerate(timestamped_transcripts):
trn_start_time = float(trn.split()[0])
trn_end_time = float(trn.split()[1])
trn_text = " ".join(trn.split()[2:])
if idx == 0:
chapter_start_time = trn_start_time
next_index = min(chapter_index + 1, len(yt_chapters) - 1)
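        # Crossing the next YouTube chapter's start time marks a chapter boundary.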
if trn_start_time >= yt_chapters[next_index]["start_time"]:
if len(chapters) == len(yt_chapters):
chapter_text += f"{trn_text}\n"
else:
chapters.append(
{
"text": chapter_text,
"start_time": chapter_start_time,
"end_time": prev_end_time,
"title": yt_chapters[chapter_index]["title"],
}
)
chapter_text = trn_text
chapter_start_time = trn_start_time
chapter_index += 1
else:
chapter_text += f"{trn_text}\n"
prev_end_time = trn_end_time
    # Flush the text accumulated for the final chapter. If the transcript never
    # crossed the last chapter boundary, that chapter has not been appended yet.
    if len(chapters) == len(yt_chapters):
        chapters[-1]["text"] += chapter_text
        chapters[-1]["end_time"] = prev_end_time
    elif chapter_text:
        chapters.append(
            {
                "text": chapter_text,
                "start_time": chapter_start_time,
                "end_time": prev_end_time,
                "title": yt_chapters[chapter_index]["title"],
            }
        )
    return chapters


def get_automatic_chapters(
timestamped_transcript, openai_api_key, chunk_lines=5, num_clusters=3
):
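    """Derive chapters automatically when the video has none.

    The transcript is split into fixed-size chunks, each chunk is embedded, and the
    embeddings are clustered with K-means; runs of chunks that share a cluster label
    become chapters (their titles are filled in later by summarize_chapters).
    """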
timestamped_transcripts = [
timestamped_line
for timestamped_line in timestamped_transcript.split("\n")
if len(timestamped_line.strip()) > 0
]
# Split into chunks
text_chunks = get_chunks(timestamped_transcripts, chunk_lines)
embeddings = get_embeddings(text_chunks, openai_api_key)
    # Create and fit the K-means model on the chunk embeddings. n_init is pinned for
    # stable behaviour across scikit-learn versions, and the cluster count is capped
    # by the number of chunks so short transcripts do not raise an error.
    kmeans = KMeans(n_clusters=min(num_clusters, len(text_chunks)), n_init=10)
    kmeans.fit(embeddings)
# Getting the cluster labels
cluster_labels = kmeans.labels_
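    # Consecutive chunks with the same cluster label are merged into one chapter;
    # a change of label marks a chapter boundary.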
current_label = -1
current_text = ""
chapters = []
for idx, (text_chunk, label) in enumerate(zip(text_chunks, cluster_labels)):
start_time, end_time = get_chunk_timestamps(text_chunk)
if idx == 0:
chapter_start_time = start_time
if label != current_label and current_label != -1:
chapters.append(
{
"text": current_text,
"start_time": chapter_start_time,
"end_time": prev_end_time,
"title": "",
}
)
current_text = ""
chapter_start_time = start_time
current_label = label
current_text += get_chunk_text(text_chunk)
prev_end_time = end_time
if len(current_text) > 0:
chapters.append(
{
"text": current_text,
"start_time": chapter_start_time,
"end_time": prev_end_time,
"title": "",
}
)
return chapters


def get_chunk_timestamps(chunk):
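    """Return the start time of a chunk's first line and the end time of its last line."""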
start_time = float(chunk.strip().split("\n")[0].split()[0])
end_time = float(chunk.strip().split("\n")[-1].split()[1])
return start_time, end_time


def get_chunk_text(chunk):
    """Drop the two timestamp fields from each chunk line and return only the words."""
    words = []
    for chunk_line in chunk.strip().split("\n"):
        words.extend(chunk_line.split()[2:])
    # The trailing space keeps consecutive chunks separated when callers concatenate them.
    return " ".join(words) + " "


def summarize_chapters(chapters, openai_api_key):
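    """Summarize and title each chapter with LangChain map-reduce chains.

    Chapters that already carry a YouTube title keep it; otherwise a title is
    generated from the chapter summary. Returns the per-chapter results and an
    overall summary of the whole video.
    """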
llm = OpenAI(temperature=0.9, openai_api_key=openai_api_key)
chapter_docs = [Document(page_content=chapter["text"]) for chapter in chapters]
summary_chain = load_summarize_chain(
llm, chain_type="map_reduce", return_intermediate_steps=True
)
summaries = summary_chain(
{"input_documents": chapter_docs}, return_only_outputs=True
)
summary_docs = [
Document(page_content=summary) for summary in summaries["intermediate_steps"]
]
title_chain = load_summarize_chain(
llm,
chain_type="map_reduce",
return_intermediate_steps=True,
map_prompt=TITLE_PROMPT,
)
titles = title_chain({"input_documents": summary_docs}, return_only_outputs=True)
summarized_chapters = []
for chapter, chapter_summary, chapter_title in zip(
chapters, summaries["intermediate_steps"], titles["intermediate_steps"]
):
if len(chapter["title"]) > 0:
chapter_title = chapter["title"]
summarized_chapters.append(
{
"start": chapter["start_time"],
"end": chapter["end_time"],
"text": chapter["text"],
"title": chapter_title.strip(),
"summary": chapter_summary.strip(),
}
)
return summarized_chapters, summaries["output_text"]
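

# Example usage sketch: it assumes the transcript file uses one line per segment in
# the form "<start_seconds> <end_seconds> <text>" (the format parsed above) and that
# an OPENAI_API_KEY environment variable is set; "transcript.txt" is a placeholder path.
if __name__ == "__main__":
    api_key = os.environ["OPENAI_API_KEY"]
    transcript = text_from_file("transcript.txt")
    # No YouTube chapters available here, so derive them by clustering chunk embeddings.
    auto_chapters = get_automatic_chapters(transcript, api_key)
    chapter_summaries, overall_summary = summarize_chapters(auto_chapters, api_key)
    for chapter in chapter_summaries:
        print(f'{chapter["start"]:.1f}-{chapter["end"]:.1f}  {chapter["title"]}')
        print(textwrap.fill(chapter["summary"], width=80))
    print("Overall summary:")
    print(textwrap.fill(overall_summary, width=80))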