import os
import textwrap

import numpy as np
import openai
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from sklearn.cluster import KMeans
from tenacity import retry, stop_after_attempt, wait_random_exponential

DEFAULT_PROMPT = (
    "Summarize this YouTube video chapter. Always start with a topical sentence: "
)
CHAPTER_TITLE = "Give a title to this video chapter based on the transcript: "
TITLE_TEMPLATE = "Give a title to this text summary: {text}"
TITLE_PROMPT = PromptTemplate(template=TITLE_TEMPLATE, input_variables=["text"])


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_embeddings(text_chunks, openai_api_key, model="text-embedding-ada-002"):
    """Embed a batch of text chunks, retrying with exponential backoff."""
    # The pre-1.0 openai client takes the key as `api_key`, not `openai_api_key`.
    data = openai.Embedding.create(
        input=text_chunks, model=model, api_key=openai_api_key
    )["data"]
    embeddings = [item["embedding"] for item in data]
    return np.array(embeddings)
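
# For example (illustrative, assuming a valid key):
#   get_embeddings(["hello world"], api_key)
# returns an array of shape (1, 1536) for text-embedding-ada-002.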


def text_from_file(text_path):
    """Return the full contents of a UTF-8 text file."""
    with open(text_path, "r", encoding="utf-8") as text_file:
        return text_file.read()


def get_chunks(timestamped_transcripts, chunk_lines):
    """Group transcript lines into chunks of at most `chunk_lines` lines."""
    chunks = []
    current_chunk = []
    for line in timestamped_transcripts:
        current_chunk.append(line)
        if len(current_chunk) == chunk_lines:
            chunks.append("\n".join(current_chunk))
            current_chunk = []
    # Keep any trailing lines that did not fill a complete chunk.
    if len(current_chunk) > 0:
        chunks.append("\n".join(current_chunk))
    return chunks
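
# For example (illustrative values), chunk_lines=2 turns three transcript
# lines into two chunks:
#   get_chunks(["0.0 1.0 hi", "1.0 2.0 there", "2.0 3.0 bye"], 2)
#   -> ["0.0 1.0 hi\n1.0 2.0 there", "2.0 3.0 bye"]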


def align_chapters(timestamped_transcript, yt_chapters):
    """Split a timestamped transcript into the chapters defined by yt_chapters.

    Each transcript line is "<start> <end> <text>"; yt_chapters is a list of
    dicts with "start_time" and "title" keys, sorted by start time.
    """
    timestamped_transcripts = timestamped_transcript.strip().split("\n")
    chapters = []
    chapter_text = ""
    chapter_start_time = 0.0
    prev_end_time = 0.0
    chapter_index = 0
    for idx, trn in enumerate(timestamped_transcripts):
        fields = trn.split()
        trn_start_time = float(fields[0])
        trn_end_time = float(fields[1])
        trn_text = " ".join(fields[2:])
        if idx == 0:
            chapter_start_time = trn_start_time
        # Once a line starts at or after the next chapter boundary, flush the
        # accumulated text as a finished chapter and start the next one.
        next_index = min(chapter_index + 1, len(yt_chapters) - 1)
        if trn_start_time >= yt_chapters[next_index]["start_time"]:
            if len(chapters) == len(yt_chapters):
                # Every chapter is already emitted; keep accumulating for the last.
                chapter_text += f"{trn_text}\n"
            else:
                chapters.append(
                    {
                        "text": chapter_text,
                        "start_time": chapter_start_time,
                        "end_time": prev_end_time,
                        "title": yt_chapters[chapter_index]["title"],
                    }
                )
                chapter_text = trn_text
                chapter_start_time = trn_start_time
                chapter_index += 1
        else:
            chapter_text += f"{trn_text}\n"
        prev_end_time = trn_end_time
    # Flush the remainder: merge it into the last emitted chapter, or append
    # it as the final chapter if it has not been emitted yet.
    if len(chapters) == len(yt_chapters):
        chapters[-1]["text"] += chapter_text
        chapters[-1]["end_time"] = prev_end_time
    else:
        chapters.append(
            {
                "text": chapter_text,
                "start_time": chapter_start_time,
                "end_time": prev_end_time,
                "title": yt_chapters[chapter_index]["title"],
            }
        )
    return chapters
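
# A minimal sketch of the expected inputs (illustrative values only):
#   transcript = "0.0 2.0 welcome back\n2.0 4.0 today we cover\n4.0 6.0 the demo"
#   yt_chapters = [
#       {"start_time": 0.0, "title": "Intro"},
#       {"start_time": 4.0, "title": "Demo"},
#   ]
#   align_chapters(transcript, yt_chapters) yields two chapters, the second
#   starting at 4.0 with title "Demo".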


def get_automatic_chapters(
    timestamped_transcript, openai_api_key, chunk_lines=5, num_clusters=3
):
    """Derive chapters without YouTube metadata by clustering chunk embeddings.

    Consecutive chunks that land in the same K-means cluster are merged into
    one chapter; a boundary is emitted whenever the cluster label changes.
    """
    timestamped_transcripts = [
        timestamped_line
        for timestamped_line in timestamped_transcript.split("\n")
        if len(timestamped_line.strip()) > 0
    ]
    # Split into fixed-size chunks and embed each one
    text_chunks = get_chunks(timestamped_transcripts, chunk_lines)
    embeddings = get_embeddings(text_chunks, openai_api_key)
    # Create and fit the K-means model (n_init pinned so behavior stays
    # stable across scikit-learn versions)
    kmeans = KMeans(n_clusters=num_clusters, n_init=10)
    kmeans.fit(embeddings)
    # Get the cluster labels
    cluster_labels = kmeans.labels_
    current_label = -1
    current_text = ""
    chapter_start_time = 0.0
    prev_end_time = 0.0
    chapters = []
    for idx, (text_chunk, label) in enumerate(zip(text_chunks, cluster_labels)):
        start_time, end_time = get_chunk_timestamps(text_chunk)
        if idx == 0:
            chapter_start_time = start_time
        # A label change marks a chapter boundary; flush the accumulated text.
        if label != current_label and current_label != -1:
            chapters.append(
                {
                    "text": current_text,
                    "start_time": chapter_start_time,
                    "end_time": prev_end_time,
                    "title": "",
                }
            )
            current_text = ""
            chapter_start_time = start_time
        current_label = label
        current_text += get_chunk_text(text_chunk)
        prev_end_time = end_time
    # Flush the final chapter.
    if len(current_text) > 0:
        chapters.append(
            {
                "text": current_text,
                "start_time": chapter_start_time,
                "end_time": prev_end_time,
                "title": "",
            }
        )
    return chapters


def get_chunk_timestamps(chunk):
    """Return the start time of a chunk's first line and end time of its last."""
    chunk_lines = chunk.strip().split("\n")
    start_time = float(chunk_lines[0].split()[0])
    end_time = float(chunk_lines[-1].split()[1])
    return start_time, end_time


def get_chunk_text(chunk):
    """Strip the leading timestamps from each line, keeping only the words."""
    chunk_text = ""
    for chunk_line in chunk.strip().split("\n"):
        # Fields 0 and 1 are the timestamps; keep the rest, with a trailing
        # space so consecutive lines and chunks do not run together.
        chunk_text += " ".join(chunk_line.split()[2:]) + " "
    return chunk_text
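
# For a single two-line chunk (illustrative values):
#   chunk = "0.0 1.5 hello\n1.5 3.0 world"
#   get_chunk_timestamps(chunk) -> (0.0, 3.0)
#   get_chunk_text(chunk)       -> "hello world "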


def summarize_chapters(chapters, openai_api_key):
    """Summarize and title each chapter, returning them with an overall summary."""
    llm = OpenAI(temperature=0.9, openai_api_key=openai_api_key)
    chapter_docs = [Document(page_content=chapter["text"]) for chapter in chapters]
    # Map-reduce summarization: "intermediate_steps" holds one summary per
    # chapter, and "output_text" holds the combined overall summary.
    summary_chain = load_summarize_chain(
        llm, chain_type="map_reduce", return_intermediate_steps=True
    )
    summaries = summary_chain(
        {"input_documents": chapter_docs}, return_only_outputs=True
    )
    # Run a second map-reduce pass over the per-chapter summaries, with a
    # titling prompt, to generate a title for each chapter.
    summary_docs = [
        Document(page_content=summary) for summary in summaries["intermediate_steps"]
    ]
    title_chain = load_summarize_chain(
        llm,
        chain_type="map_reduce",
        return_intermediate_steps=True,
        map_prompt=TITLE_PROMPT,
    )
    titles = title_chain({"input_documents": summary_docs}, return_only_outputs=True)
    summarized_chapters = []
    for chapter, chapter_summary, chapter_title in zip(
        chapters, summaries["intermediate_steps"], titles["intermediate_steps"]
    ):
        # Prefer the original YouTube chapter title when one exists.
        if len(chapter["title"]) > 0:
            chapter_title = chapter["title"]
        summarized_chapters.append(
            {
                "start": chapter["start_time"],
                "end": chapter["end_time"],
                "text": chapter["text"],
                "title": chapter_title.strip(),
                "summary": chapter_summary.strip(),
            }
        )
    return summarized_chapters, summaries["output_text"]
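

# A minimal usage sketch: "sample_transcript.txt" is a hypothetical
# placeholder, each of its lines is "<start> <end> <text>", and
# OPENAI_API_KEY is assumed to be set in the environment.
if __name__ == "__main__":
    api_key = os.environ["OPENAI_API_KEY"]
    transcript = text_from_file("sample_transcript.txt")  # hypothetical path
    # With no YouTube chapter metadata, derive chapters by clustering.
    chapters = get_automatic_chapters(transcript, api_key)
    summarized_chapters, overall_summary = summarize_chapters(chapters, api_key)
    for chapter in summarized_chapters:
        print(f"[{chapter['start']:.1f}-{chapter['end']:.1f}] {chapter['title']}")
        print(textwrap.fill(chapter["summary"], width=80))
    print(textwrap.fill(overall_summary, width=80))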