# Extraction residue from the hosting page (not part of the module):
# zman1x1 · "Upload 21 files" · commit 3456a58 · 4.61 kB
# divide the subs into chunks for more accurate summarization
# TODO: divide the subs into chunks based on the topics
# summarize each chunk and add it to the markdown file
from rich.progress import track
class legacy_chunker:
    """Legacy manual chunker: greedily packs whole words into chunks."""

    def __init__(self, text):
        # Raw text to split; ``chunker`` calls ``.split()`` on it.
        self.text = text

    def chunker(self, size=1000):
        """Split ``self.text`` into chunks made of whole words.

        Words are taken in order and added to the current chunk for as long
        as the chunk, counting one separating space per word, stays within
        ``size`` characters. Words are never cut: a single word longer than
        ``size`` becomes a chunk of its own.

        Args:
            size: Character budget per chunk, counting one space after
                every word. Defaults to 1000.

        Returns:
            list[str]: The chunks in original order, words joined by single
            spaces. Empty when the text contains no words.
        """
        chunks = []
        current_words = []
        # Length of the current chunk counted as "word + one space" per word,
        # which is how the budget is measured.
        current_len = 0
        for word in self.text.split():
            cost = len(word) + 1
            # Only flush a chunk that actually holds words; otherwise an
            # oversized first word would emit an empty chunk.
            if current_words and current_len + cost > size:
                chunks.append(" ".join(current_words))
                current_words = []
                current_len = 0
            current_words.append(word)
            current_len += cost
        if current_words:
            chunks.append(" ".join(current_words))
        return chunks

    def __sizeof__(self) -> int:
        """Return the number of characters in ``self.text``.

        Note that this overrides the hook consulted by ``sys.getsizeof``.
        """
        return len(self.text)
class LangChainChunker:
    """Chunker that delegates the splitting to LangChain's
    ``CharacterTextSplitter``."""

    def __init__(self, text):
        # Raw text handed to the splitter by ``chunker``.
        self.text = text

    def chunker(self, size=1000):
        """Split ``self.text`` on spaces with ``CharacterTextSplitter``.

        Args:
            size: Passed to the splitter as ``chunk_size``. Defaults to 1000.

        Returns:
            Whatever ``CharacterTextSplitter.split_text`` returns for
            ``self.text``.
        """
        # Imported at call time, so langchain is only needed when this
        # method actually runs.
        from langchain.text_splitter import CharacterTextSplitter

        # TODO: attach the duration of the video to the chunk
        # [[chunk, duration]]
        # NOTE(review): chunk_overlap is presumably meant to be an integer
        # character count; 0.9 likely behaves like "no overlap" -- confirm
        # the intent against the splitter's documentation.
        splitter_options = {
            "separator": " ",
            "chunk_size": size,
            "chunk_overlap": 0.9,
        }
        splitter = CharacterTextSplitter(**splitter_options)
        return splitter.split_text(self.text)

    def __sizeof__(self) -> int:
        """Return how many items iterating ``self.text`` yields (its
        character count for a string)."""
        return sum(1 for _ in self.text)
def _pack_sub_lines(lines, size):
    """Pack ``(start, text)`` subtitle lines into chunks shorter than ``size``.

    Returns a list of ``[[chunk_text], [timestamp]]`` entries. The timestamp
    is the start of the line at which the chunk was closed: the line that did
    not fit any more, or, for the trailing chunk, the last line it contains.
    """
    chunks = []
    parts = []
    length = 0  # len(" ".join(parts))
    last_start = None
    for start, text in lines:
        # Close the running chunk when adding this line (plus its separating
        # space) would reach the budget. The line itself is NOT discarded:
        # it opens the next chunk. An empty chunk is never closed, so a
        # single oversized line simply becomes its own chunk.
        if parts and length + 1 + len(text) >= size:
            chunks.append([[" ".join(parts).strip()], [start]])
            parts = []
            length = 0
        length += len(text) + (1 if parts else 0)
        parts.append(text)
        last_start = start
    # Flush what is left so short chapters and chapter tails are kept.
    if parts:
        chunks.append([[" ".join(parts).strip()], [last_start]])
    return chunks


def ChunkByChapters(chapters: list, subs: list, size=1000):
    """Chunk the youtube video subtitles based on the chapters.

    Every chapter receives the subtitle lines whose ``'start'`` falls in
    ``[chapter start, next chapter start)``. The first chapter is opened at
    0 regardless of its own ``'time'`` so leading subtitles are kept, and the
    last chapter has no upper bound. The lines of a chapter are joined with
    single spaces into chunks shorter than ``size`` characters.

    Args:
        chapters (list): Chapters (from the yt api); each item is indexed
            with ``'title'`` and ``'time'``. Assumed to be ordered by
            ``'time'`` -- TODO confirm with the caller.
        subs (list): Subtitles (from the yt api); each item is indexed with
            ``'start'`` and ``'text'``. ``'start'`` must be comparable with
            the chapters' ``'time'`` values.
        size (int, optional): Upper bound (exclusive) on the character
            length of a chunk. A single subtitle line that is longer than
            this is still emitted, as a chunk of its own. Defaults to 1000.

    Raises:
        Exception: No chapters found

    Returns:
        dict: one entry per chapter title, structured as
            chunk_dict = {
                "chapter1": [
                    [[chunk1_text], [chunk1_timestamp]],
                    [[chunk2_text], [chunk2_timestamp]],
                    ...
                ],
                ...
            }
        where a timestamp is the ``'start'`` of the subtitle line at which
        the chunk was closed. Chapters that share a title overwrite each
        other; the later one wins.
    """
    if not chapters:
        raise Exception("No chapters found")

    # Half-open windows [start, end) per chapter. Using the next chapter's
    # start directly as the end keeps the subtitles that begin during the
    # final time unit of a chapter.
    starts = [0] + [chapter['time'] for chapter in chapters[1:]]
    ends = starts[1:] + [float("inf")]

    chunk_dict = {}
    for c in track(
        range(len(chapters)),
        description="Chunking by chapters: "
    ):
        start, end = starts[c], ends[c]
        lines = [
            (sub['start'], sub['text'])
            for sub in subs
            if start <= sub['start'] < end
        ]
        chunk_dict[chapters[c]['title']] = _pack_sub_lines(lines, size)
    return chunk_dict