"""Divide the subtitles into chunks for more accurate summarization.

Each chunk is summarized and added to the markdown file.
TODO: divide the subs into chunks based on the topics.
"""
from rich.progress import track
class legacy_chunker:
    """Legacy manual chunker: word-aligned chunks of at most `size` characters."""

    def __init__(self, text):
        self.text = text

    def chunker(self, size=1000):
        words = self.text.split()
        chunks = []
        current_chunk = ""
        for word in words:
            if len(current_chunk) + len(word) + 1 <= size:
                current_chunk += f"{word} "
            else:
                if current_chunk:
                    # guard against the empty chunk a size-exceeding word would create
                    chunks.append(current_chunk.strip())
                current_chunk = f"{word} "
        if current_chunk:
            chunks.append(current_chunk.strip())
        return chunks

    def __sizeof__(self) -> int:
        # character count of the underlying text
        return len(self.text)
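
# A quick usage sketch (hypothetical input string, not part of the module):
#
#     chunks = legacy_chunker("word " * 500).chunker(size=100)
#     assert all(len(chunk) <= 100 for chunk in chunks)
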
class LangChainChunker:
    """Chunker backed by LangChain's CharacterTextSplitter."""

    def __init__(self, text):
        self.text = text

    def chunker(self, size=1000):
        from langchain.text_splitter import CharacterTextSplitter

        # TODO: attach the duration of the video to the chunk -> [[chunk, duration]]
        text_splitter = CharacterTextSplitter(
            separator=" ",
            chunk_size=size,
            chunk_overlap=0,  # overlap is a character count, not a fraction
        )
        return text_splitter.split_text(self.text)

    def __sizeof__(self) -> int:
        # character count of the underlying text
        return len(self.text)
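
# Usage sketch (assumes the `langchain` package is installed; hypothetical input):
#
#     chunks = LangChainChunker("word " * 500).chunker(size=100)
#     # CharacterTextSplitter splits on the given separator and merges the
#     # pieces back up to chunk_size, so each chunk stays at or under ~100 chars.
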
def ChunkByChapters(chapters: list, subs: list, size=1000):
    """Chunk the YouTube video subtitles based on the chapters.

    Args:
        chapters (list): Chapters from the yt api
        subs (list): Subtitles from the yt api
        size (int, optional): Maximum chunk size in characters. Defaults to 1000.

    Raises:
        Exception: No chapters found

    Returns:
        dict: chunk_dict = {
            "chapter1": [
                [chunk1, chunk2, chunk3, ...],
                [chunk1_start, chunk2_start, chunk3_start, ...]
            ],
            ...
        }
    """
    chunk_dict = {}
    # format chapters for chunking: [[title, timestamp], ...]
    Fchapters = [[chapter['title'], chapter['time']] for chapter in chapters]
    if len(chapters) == 0:
        raise Exception("No chapters found")
    else:
        ## STEP 1:
        # Chapter timestamps mark the *beginning* of each chapter. To process
        # a chapter's subs without repeatedly checking whether each sub falls
        # inside it, it is easier to store each chapter's *end* instead: shift
        # every timestamp to one second before the next chapter starts. The
        # last chapter has no successor, so it runs to the end of the video.
        for c in range(len(Fchapters) - 1):
            Fchapters[c][1] = Fchapters[c + 1][1] - 1
        Fchapters[-1][1] = float("inf")
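        # A tiny worked example (hypothetical timestamps) of the shift above:
        #   before: [["Intro", 0], ["Setup", 60], ["Demo", 120]]
        #   after:  [["Intro", 59], ["Setup", 119], ["Demo", inf]]
        # so chapter c spans Fchapters[c-1][1] + 1 through Fchapters[c][1].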
        ## STEP 2: chunking based on chapters
        # For each chapter, chunk its subs and add the result to chunk_dict:
        #
        # chunk_dict = {
        #     "chapter1": [
        #         [chunk1, chunk2, chunk3, ...],
        #         [chunk1_start, chunk2_start, chunk3_start, ...]
        #     ],
        #     ...
        # }
        #
        for c in track(
            range(len(Fchapters)),
            description="Chunking by chapters: "
        ):
            title = Fchapters[c][0]
            # start and end of the chapter, in seconds
            start = 0 if c == 0 else Fchapters[c - 1][1] + 1
            end = Fchapters[c][1]
            chunk_texts = []
            chunk_starts = []
            current_chunk = ""
            current_start = None
            ## STEP 2 (a): process the subs
            # collect every sub that falls inside the chapter, flushing a
            # chunk (plus its start time) whenever adding the next sub line
            # would exceed `size`
            for sublinedata in subs:
                cstart: int = sublinedata['start']
                subline: str = sublinedata['text']
                if cstart < start:
                    continue
                if cstart >= end:
                    break
                if current_start is None:
                    current_start = cstart
                if len(current_chunk) + len(subline) + 1 < size:
                    current_chunk += subline
                else:
                    chunk_texts.append(current_chunk.strip())
                    chunk_starts.append(current_start)
                    current_chunk = subline  # keep the sub that overflowed
                    current_start = cstart
            if current_chunk:
                # flush the remainder of the chapter
                chunk_texts.append(current_chunk.strip())
                chunk_starts.append(current_start)
            chunk_dict.update({title: [chunk_texts, chunk_starts]})
        return chunk_dict
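
# Minimal smoke test with hypothetical chapter/subtitle data, mirroring the
# shapes the yt api handlers pass in (not part of the original module):
if __name__ == "__main__":
    demo_chapters = [
        {"title": "Intro", "time": 0},
        {"title": "Main topic", "time": 60},
    ]
    demo_subs = [
        {"start": 5, "text": "welcome to the video "},
        {"start": 62, "text": "now for the main topic "},
        {"start": 80, "text": "some more detail "},
    ]
    result = ChunkByChapters(demo_chapters, demo_subs, size=50)
    for chapter, (texts, starts) in result.items():
        print(chapter, texts, starts)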