File size: 4,613 Bytes
9150552
 
 
3456a58
9150552
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3456a58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# divide the subs into chunks for more accurate summarization
# TODO: divide the subs into chunks based on the topics
# summarize each chunk and add it to the markdown file
from rich.progress import track

class legacy_chunker:
    """Manual word-based text chunker (legacy; superseded by LangChainChunker)."""

    def __init__(self, text):
        # text: full subtitle/transcript string to split.
        self.text = text

    def chunker(self, size=1000):
        """Split ``self.text`` into chunks of at most ``size`` characters.

        Words are never split across chunks; each chunk is assembled word
        by word and stripped of its trailing space before being returned.

        Args:
            size (int, optional): Soft maximum chunk length. Defaults to 1000.

        Returns:
            list[str]: The chunks, in order. Empty list for empty text.
        """
        chunks = []
        current_chunk = ""
        for word in self.text.split():
            # +1 accounts for the trailing space appended after each word.
            if len(current_chunk) + len(word) + 1 <= size:
                current_chunk += f"{word} "
            else:
                chunks.append(current_chunk.strip())
                current_chunk = f"{word} "
        if current_chunk:
            chunks.append(current_chunk.strip())
        return chunks

    def __sizeof__(self) -> int:
        # NOTE(review): __sizeof__ conventionally reports an object's size
        # in bytes; here it is (ab)used to report the text length. Kept for
        # backward compatibility, but len() replaces the manual count loop.
        return len(self.text)
            
class LangChainChunker:
    """Text chunker backed by LangChain's CharacterTextSplitter."""

    def __init__(self, text):
        # text: full subtitle/transcript string to split.
        self.text = text

    def chunker(self, size=1000):
        """Split ``self.text`` into chunks of roughly ``size`` characters.

        Requires the optional ``langchain`` package; it is imported lazily
        so the module loads even when langchain is not installed.

        Args:
            size (int, optional): Target chunk size. Defaults to 1000.

        Returns:
            list[str]: Chunks produced by CharacterTextSplitter.
        """
        from langchain.text_splitter import CharacterTextSplitter

        # TODO: attach the duration of the video to the chunk
        # -> [[chunk, duration]]

        # NOTE(review): LangChain documents chunk_overlap as a character
        # count (int); 0.9 looks like it was meant as a 90% fraction —
        # confirm intent before changing, since it alters split output.
        text_splitter = CharacterTextSplitter(
            separator=" ",
            chunk_size=size,
            chunk_overlap=0.9,
        )

        return text_splitter.split_text(self.text)

    def __sizeof__(self) -> int:
        # NOTE(review): returns the text length, not the in-memory byte
        # size that __sizeof__ normally reports (see legacy_chunker).
        return len(self.text)

def ChunkByChapters(chapters: list, subs: list, size=1000):
    """Chunk the youtube video subtitles based on the chapters.

    Args:
        chapters (list): Chapters from the yt api; each item is a dict with
            at least 'title' and 'time' (chapter start, in seconds).
        subs (list): Subtitles from the yt api; each item is a dict with
            'start' (seconds) and 'text'. Assumed sorted by 'start' —
            TODO confirm against the caller.
        size (int, optional): Soft maximum chunk length in characters.
            Defaults to 1000.

    Raises:
        Exception: No chapters found.

    Returns:
        dict : structure chunk_dict = {
              "chapter1": [
                  [[chunk1_text], [chunk1_start_time]],
                  [[chunk2_text], [chunk2_start_time]],
                  ...
              ],
              ...
          }
    """
    # Fail fast before doing any work.
    if len(chapters) == 0:
        raise Exception("No chapters found")

    # Optional progress bar: fall back to a plain iterable when the
    # rich package is unavailable.
    try:
        from rich.progress import track
    except ImportError:
        def track(iterable, description=""):
            return iterable

    # Normalize chapters to [title, start_time] pairs for easy indexing.
    fchapters = [[chapter['title'], chapter['time']] for chapter in chapters]

    chunk_dict = {}

    for c in track(
        range(len(fchapters)),
        description="Chunking by chapters: "
    ):
        title = fchapters[c][0]

        # A chapter spans [its own start, the next chapter's start).
        # The last chapter runs to the end of the video (previously it
        # was skipped entirely).
        start = fchapters[c][1]
        end = fchapters[c + 1][1] if c + 1 < len(fchapters) else float("inf")

        chunks = []
        current_chunk = ""
        chunk_start = start  # timestamp of the last sub folded into current_chunk

        # Walk the subs that fall inside this chapter's time window.
        for sublinedata in subs:
            cstart: int = sublinedata['start']
            subline: str = sublinedata['text']

            if cstart < start:
                continue
            if cstart >= end:
                # assumes subs are sorted by start time — TODO confirm
                break

            if len(current_chunk) + len(subline) + 1 < size:
                current_chunk += subline
            else:
                # Chunk is full: emit it, then start the next chunk with
                # the current subtitle (previously this line was dropped).
                chunks.append([[current_chunk.strip()], [chunk_start]])
                current_chunk = subline
            chunk_start = cstart

        # Flush the trailing partial chunk. Previously it was discarded,
        # which emptied the result whenever a chapter's text fit in `size`.
        if current_chunk:
            chunks.append([[current_chunk.strip()], [chunk_start]])

        chunk_dict[title] = chunks

    return chunk_dict