"""Divide the subtitles into chunks for more accurate summarization.

Each chunk is summarized and added to the markdown file.
TODO: divide the subs into chunks based on the topics.
"""
from rich.progress import track
class legacy_chunker:
    """Legacy manual chunker: word-aligned chunks of at most `size` characters."""

    def __init__(self, text):
        self.text = text

    def chunker(self, size=1000):
        words = self.text.split()
        chunks = []
        current_chunk = ""
        for word in words:
            if len(current_chunk) + len(word) + 1 <= size:
                current_chunk += f"{word} "
            else:
                if current_chunk:
                    # guard against the empty chunk a size-exceeding word would create
                    chunks.append(current_chunk.strip())
                current_chunk = f"{word} "
        if current_chunk:
            chunks.append(current_chunk.strip())
        return chunks

    def __sizeof__(self) -> int:
        # character count of the underlying text
        return len(self.text)
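
# A quick usage sketch (hypothetical input string, not part of the module):
#
#     chunks = legacy_chunker("word " * 500).chunker(size=100)
#     assert all(len(chunk) <= 100 for chunk in chunks)
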
class LangChainChunker:
    """Chunker backed by LangChain's CharacterTextSplitter."""

    def __init__(self, text):
        self.text = text

    def chunker(self, size=1000):
        from langchain.text_splitter import CharacterTextSplitter

        # TODO: attach the duration of the video to the chunk -> [[chunk, duration]]
        text_splitter = CharacterTextSplitter(
            separator=" ",
            chunk_size=size,
            chunk_overlap=0,  # overlap is a character count, not a fraction
        )
        return text_splitter.split_text(self.text)

    def __sizeof__(self) -> int:
        # character count of the underlying text
        return len(self.text)
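
# Usage sketch (assumes the `langchain` package is installed; hypothetical input):
#
#     chunks = LangChainChunker("word " * 500).chunker(size=100)
#     # CharacterTextSplitter splits on the given separator and merges the
#     # pieces back up to chunk_size, so each chunk stays at or under ~100 chars.
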
def ChunkByChapters(chapters: list, subs: list, size=1000):
    """Chunk the YouTube video subtitles based on the chapters.

    Args:
        chapters (list): Chapters from the yt api
        subs (list): Subtitles from the yt api
        size (int, optional): Maximum chunk size in characters. Defaults to 1000.

    Raises:
        Exception: No chapters found

    Returns:
        dict: chunk_dict = {
            "chapter1": [
                [chunk1, chunk2, chunk3, ...],
                [chunk1_start, chunk2_start, chunk3_start, ...]
            ],
            ...
        }
    """
    chunk_dict = {}
    # format chapters for chunking: [[title, timestamp], ...]
    Fchapters = [[chapter['title'], chapter['time']] for chapter in chapters]
    if len(chapters) == 0:
        raise Exception("No chapters found")
    else:
        ## STEP 1:
        # Chapter timestamps mark the *beginning* of each chapter. To process
        # a chapter's subs without repeatedly checking whether each sub falls
        # inside it, it is easier to store each chapter's *end* instead: shift
        # every timestamp to one second before the next chapter starts. The
        # last chapter has no successor, so it runs to the end of the video.
        for c in range(len(Fchapters) - 1):
            Fchapters[c][1] = Fchapters[c + 1][1] - 1
        Fchapters[-1][1] = float("inf")
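        # A tiny worked example (hypothetical timestamps) of the shift above:
        #   before: [["Intro", 0], ["Setup", 60], ["Demo", 120]]
        #   after:  [["Intro", 59], ["Setup", 119], ["Demo", inf]]
        # so chapter c spans Fchapters[c-1][1] + 1 through Fchapters[c][1].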
        ## STEP 2: chunking based on chapters
        # For each chapter, chunk its subs and add the result to chunk_dict:
        #
        # chunk_dict = {
        #     "chapter1": [
        #         [chunk1, chunk2, chunk3, ...],
        #         [chunk1_start, chunk2_start, chunk3_start, ...]
        #     ],
        #     ...
        # }
        #
        for c in track(
            range(len(Fchapters)),
            description="Chunking by chapters: "
        ):
            title = Fchapters[c][0]
            # start and end of the chapter, in seconds
            start = 0 if c == 0 else Fchapters[c - 1][1] + 1
            end = Fchapters[c][1]
            chunk_texts = []
            chunk_starts = []
            current_chunk = ""
            current_start = None
            ## STEP 2 (a): process the subs
            # collect every sub that falls inside the chapter, flushing a
            # chunk (plus its start time) whenever adding the next sub line
            # would exceed `size`
            for sublinedata in subs:
                cstart: int = sublinedata['start']
                subline: str = sublinedata['text']
                if cstart < start:
                    continue
                if cstart >= end:
                    break
                if current_start is None:
                    current_start = cstart
                if len(current_chunk) + len(subline) + 1 < size:
                    current_chunk += subline
                else:
                    chunk_texts.append(current_chunk.strip())
                    chunk_starts.append(current_start)
                    current_chunk = subline  # keep the sub that overflowed
                    current_start = cstart
            if current_chunk:
                # flush the remainder of the chapter
                chunk_texts.append(current_chunk.strip())
                chunk_starts.append(current_start)
            chunk_dict.update({title: [chunk_texts, chunk_starts]})
        return chunk_dict
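
# Minimal smoke test with hypothetical chapter/subtitle data, mirroring the
# shapes the yt api handlers pass in (not part of the original module):
if __name__ == "__main__":
    demo_chapters = [
        {"title": "Intro", "time": 0},
        {"title": "Main topic", "time": 60},
    ]
    demo_subs = [
        {"start": 5, "text": "welcome to the video "},
        {"start": 62, "text": "now for the main topic "},
        {"start": 80, "text": "some more detail "},
    ]
    result = ChunkByChapters(demo_chapters, demo_subs, size=50)
    for chapter, (texts, starts) in result.items():
        print(chapter, texts, starts)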