# divide the subs into chunks for more accurate summarization
# TODO: divide the subs into chunks based on the topics
# summarize each chunk and add it to the markdown file
from rich.progress import track


class legacy_chunker:
    # legacy manual chunker
    def __init__(self, text):
        self.text = text

    def chunker(self, size=1000):
        # greedily pack whole words into chunks of at most `size` characters
        words = self.text.split()
        chunks = []
        current_chunk = ""
        for word in words:
            if len(current_chunk) + len(word) + 1 <= size:
                current_chunk += f"{word} "
            else:
                chunks.append(current_chunk.strip())
                current_chunk = f"{word} "
        if current_chunk:
            chunks.append(current_chunk.strip())
        return chunks

    def __sizeof__(self) -> int:
        # character count of the wrapped text (not the in-memory byte size
        # that object.__sizeof__ normally reports)
        return len(self.text)
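
# Usage sketch for the manual chunker (illustrative only; the strings are
# made up). Whole words are packed greedily into chunks of at most `size`
# characters:
#   legacy_chunker("alpha beta gamma delta").chunker(size=12)
#   -> ['alpha beta', 'gamma delta']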


class LangChainChunker:
    def __init__(self, text):
        self.text = text

    def chunker(self, size=1000):
        from langchain.text_splitter import CharacterTextSplitter
        # TODO: attach the duration of the video to each chunk: [[chunk, duration]]
        text_splitter = CharacterTextSplitter(
            separator=" ",
            chunk_size=size,
            # chunk_overlap is a character count, not a fraction; the earlier
            # value of 0.9 amounted to no overlap, so make that explicit
            chunk_overlap=0,
        )
        return text_splitter.split_text(self.text)

    def __sizeof__(self) -> int:
        # character count of the wrapped text, mirroring legacy_chunker
        return len(self.text)
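
# Usage sketch (assumes the `langchain` package is installed; `full_transcript`
# and `summarize` are hypothetical stand-ins for the rest of the pipeline):
#   for chunk in LangChainChunker(full_transcript).chunker(size=1000):
#       summarize(chunk)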


def ChunkByChapters(chapters: list, subs: list, size=1000):
    """Chunk the YouTube video subtitles based on the chapters.

    Args:
        chapters (list): chapters from the YouTube API
        subs (list): subtitles from the YouTube API
        size (int, optional): maximum chunk size in characters. Defaults to 1000.

    Raises:
        Exception: no chapters found

    Returns:
        dict: structure chunk_dict = {
            "chapter1": [
                [[chunk1], [chunk1_start]],
                [[chunk2], [chunk2_start]],
                ...
            ],
            ...
        }
        where each start value is the timestamp (in seconds) of the
        subtitle line that closed the chunk
    """
    if len(chapters) == 0:
        raise Exception("No chapters found")

    chunks = []
    chunk_dict = {}
    # format chapters for chunking: [title, timestamp] pairs
    Fchapters = [[chapter['title'], chapter['time']] for chapter in chapters]

    ## STEP 1:
    # Chapter timestamps mark the *beginning* of each chapter. Rewriting each
    # timestamp as the chapter's last second turns the sub scan below into a
    # simple range check instead of a per-sub chapter lookup.
    for c in range(len(Fchapters) - 1):
        Fchapters[c][1] = Fchapters[c + 1][1] - 1
    # the last chapter runs to the end of the video
    Fchapters[-1][1] = float("inf")

    ## STEP 2: chunking based on chapters
    # for each chapter, collect its subs into chunks of at most `size`
    # characters and record them in chunk_dict under the chapter title
    for c in track(
        range(len(Fchapters)),  # includes the last chapter
        description="Chunking by chapters: "
    ):
        title = Fchapters[c][0]
        # set the start and end of the chapter
        start = 0 if c == 0 else Fchapters[c - 1][1] + 1
        end = Fchapters[c][1]
        current_chunk = ""
        last_start = start
        ## STEP 2 (a): process the subs
        # for each sub inside the chapter window, append it to the current
        # chunk, flushing the chunk whenever the next line would overflow
        for sublinedata in subs:
            cstart: float = sublinedata['start']
            subline: str = sublinedata['text']
            if cstart < start:
                continue
            if cstart >= end:
                break
            last_start = cstart
            if len(current_chunk) + len(subline) + 1 < size:
                current_chunk += f"{subline} "
            else:
                chunks.append(
                    [
                        [current_chunk.strip()],
                        [cstart],
                    ]
                )
                # start the next chunk with the overflowing line instead of
                # dropping it
                current_chunk = f"{subline} "
        # flush the tail of the chapter so its last chunk is not lost
        if current_chunk.strip():
            chunks.append([[current_chunk.strip()], [last_start]])
        chunk_dict.update({title: chunks})
        chunks = []
    return chunk_dict
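

if __name__ == "__main__":
    # Smoke test with made-up data (timestamps in seconds); the real chapters
    # and subtitles come from the YouTube API elsewhere in the app.
    from pprint import pprint

    demo_chapters = [
        {"title": "Intro", "time": 0},
        {"title": "Main topic", "time": 60},
    ]
    demo_subs = [
        {"start": 1.0, "text": "welcome to the video"},
        {"start": 30.0, "text": "today we cover chunking"},
        {"start": 65.0, "text": "now the main topic"},
    ]
    pprint(ChunkByChapters(demo_chapters, demo_subs, size=50))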