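"""Chapter detection and summarization for YouTube video transcripts.

Given a timestamped transcript (one "<start> <end> <text>" line per caption),
align it against creator-provided chapters, or derive chapters automatically by
clustering OpenAI embeddings of transcript chunks with k-means, then title and
summarize each chapter with LangChain.
"""
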
import os

import numpy as np
import openai
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from sklearn.cluster import KMeans
from tenacity import retry, stop_after_attempt, wait_random_exponential  # exponential backoff

DEFAULT_PROMPT = (
    "Summarize this YouTube video chapter. Always start with a topical sentence: "
)
CHAPTER_TITLE = "Give a title to this video chapter based on the transcript: "

title_template = "Give a title to this text summary: {text}"
TITLE_PROMPT = PromptTemplate(template=title_template, input_variables=["text"])

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_embeddings(text_chunks, openai_api_key, model="text-embedding-ada-002"):
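    """Embed a batch of text chunks with OpenAI, retrying with exponential
    backoff on transient API errors, and return them as a NumPy array."""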
    data = openai.Embedding.create(
        input=text_chunks, model=model, api_key=openai_api_key
    )["data"]
    embeddings = [item["embedding"] for item in data]
    return np.array(embeddings)


def text_from_file(text_path):
    with open(text_path, "r", encoding="utf-8") as text_file:
        return text_file.read()


def get_chunks(timestamped_transcripts, chunk_lines):
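    """Group consecutive transcript lines into newline-joined chunks of at
    most ``chunk_lines`` lines each."""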
    chunks = []
    current_chunk = []
    for line in timestamped_transcripts:
        current_chunk.append(line)
        if len(current_chunk) == chunk_lines:
            chunks.append("\n".join(current_chunk))
            current_chunk = []

    if len(current_chunk) > 0:
        chunks.append("\n".join(current_chunk))

    return chunks


def align_chapters(timestamped_transcript, yt_chapters):
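    """Split a timestamped transcript ("<start> <end> <text>" per line) into
    the creator-provided chapters in ``yt_chapters`` (dicts with at least
    "start_time" and "title"), returning chapter dicts with aligned text and
    time bounds."""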
    timestamped_transcripts = timestamped_transcript.strip().split("\n")

    chapters = []
    chapter_text = ""
    chapter_start_time = 0.0
    prev_end_time = 0.0
    chapter_index = 0
    for idx, trn in enumerate(timestamped_transcripts):
        parts = trn.split()
        trn_start_time = float(parts[0])
        trn_end_time = float(parts[1])
        trn_text = " ".join(parts[2:])

        if idx == 0:
            chapter_start_time = trn_start_time

        # Once the transcript passes the next chapter's start time, close out
        # the current chapter. After the final chapter has been emitted, keep
        # accumulating trailing text; it is folded into the last chapter below.
        next_index = min(chapter_index + 1, len(yt_chapters) - 1)
        if trn_start_time >= yt_chapters[next_index]["start_time"]:
            if len(chapters) == len(yt_chapters):
                chapter_text += f"{trn_text}\n"
            else:
                chapters.append(
                    {
                        "text": chapter_text,
                        "start_time": chapter_start_time,
                        "end_time": prev_end_time,
                        "title": yt_chapters[chapter_index]["title"],
                    }
                )
                chapter_text = trn_text
                chapter_start_time = trn_start_time
                chapter_index += 1
        else:
            chapter_text += f"{trn_text}\n"
        prev_end_time = trn_end_time

    if len(chapters) == len(yt_chapters):
        chapter_index = len(yt_chapters) - 1
        chapters[chapter_index]["text"] += chapter_text
        chapters[chapter_index]["end_time"] = prev_end_time
    return chapters


def get_automatic_chapters(
    timestamped_transcript, openai_api_key, chunk_lines=5, num_clusters=3
):
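    """Derive chapters when no creator markers exist: embed fixed-size
    transcript chunks, cluster the embeddings with k-means, and start a new
    chapter wherever consecutive chunks land in different clusters."""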
    timestamped_transcripts = [
        timestamped_line
        for timestamped_line in timestamped_transcript.split("\n")
        if len(timestamped_line.strip()) > 0
    ]

    # Split into chunks
    text_chunks = get_chunks(timestamped_transcripts, chunk_lines)
    embeddings = get_embeddings(text_chunks, openai_api_key)

    # Cluster the chunk embeddings with k-means; one label per chunk.
    kmeans = KMeans(n_clusters=num_clusters)
    cluster_labels = kmeans.fit_predict(embeddings)

    current_label = -1
    current_text = ""
    chapters = []
    for idx, (text_chunk, label) in enumerate(zip(text_chunks, cluster_labels)):
        start_time, end_time = get_chunk_timestamps(text_chunk)

        if idx == 0:
            chapter_start_time = start_time

        if label != current_label and current_label != -1:
            chapters.append(
                {
                    "text": current_text,
                    "start_time": chapter_start_time,
                    "end_time": prev_end_time,
                    "title": "",
                }
            )
            current_text = ""
            chapter_start_time = start_time

        current_label = label
        current_text += get_chunk_text(text_chunk)
        prev_end_time = end_time
    if len(current_text) > 0:
        chapters.append(
            {
                "text": current_text,
                "start_time": chapter_start_time,
                "end_time": prev_end_time,
                "title": "",
            }
        )
    return chapters


def get_chunk_timestamps(chunk):
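    """Return the start time of the chunk's first line and the end time of
    its last line."""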
    start_time = float(chunk.strip().split("\n")[0].split()[0])
    end_time = float(chunk.strip().split("\n")[-1].split()[1])
    return start_time, end_time


def get_chunk_text(chunk):
    """Drop the two timestamp columns from each line and rejoin the words."""
    words = []
    for chunk_line in chunk.strip().split("\n"):
        words.extend(chunk_line.split()[2:])
    # Trailing space keeps consecutive chunks from fusing when concatenated.
    return " ".join(words) + " "


def summarize_chapters(chapters, openai_api_key):
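    """Map-reduce summarize each chapter, derive a title from each summary,
    and return per-chapter results plus an overall video summary."""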
    llm = OpenAI(temperature=0.9, openai_api_key=openai_api_key)
    chapter_docs = [Document(page_content=chapter["text"]) for chapter in chapters]

    summary_chain = load_summarize_chain(
        llm, chain_type="map_reduce", return_intermediate_steps=True
    )
    summaries = summary_chain(
        {"input_documents": chapter_docs}, return_only_outputs=True
    )

    summary_docs = [
        Document(page_content=summary) for summary in summaries["intermediate_steps"]
    ]

    title_chain = load_summarize_chain(
        llm,
        chain_type="map_reduce",
        return_intermediate_steps=True,
        map_prompt=TITLE_PROMPT,
    )
    titles = title_chain({"input_documents": summary_docs}, return_only_outputs=True)

    summarized_chapters = []
    for chapter, chapter_summary, chapter_title in zip(
        chapters, summaries["intermediate_steps"], titles["intermediate_steps"]
    ):
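        # Prefer the creator-provided chapter title when one exists.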
        if len(chapter["title"]) > 0:
            chapter_title = chapter["title"]
        summarized_chapters.append(
            {
                "start": chapter["start_time"],
                "end": chapter["end_time"],
                "text": chapter["text"],
                "title": chapter_title.strip(),
                "summary": chapter_summary.strip(),
            }
        )
    return summarized_chapters, summaries["output_text"]
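

# Minimal usage sketch. Assumptions not defined elsewhere in this module: a
# transcript file "transcript.txt" with one "<start> <end> <text>" line per
# caption, and an OPENAI_API_KEY environment variable.
if __name__ == "__main__":
    api_key = os.environ["OPENAI_API_KEY"]
    transcript = text_from_file("transcript.txt")

    # No creator-provided chapter markers, so derive chapters automatically.
    chapters = get_automatic_chapters(transcript, api_key, num_clusters=3)
    summarized_chapters, overall_summary = summarize_chapters(chapters, api_key)

    for chapter in summarized_chapters:
        print(f"[{chapter['start']:.1f}s-{chapter['end']:.1f}s] {chapter['title']}")
        print(f"    {chapter['summary']}")
    print(f"\nVideo summary: {overall_summary}")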