# Hosting-page residue (not part of the program): deepaksarika01 - "Upload 5 files" - 7cf68b3
class LangChainChunker:
    """Split a piece of text into chunks with LangChain's ``CharacterTextSplitter``."""

    def __init__(self, text):
        """Store the text that ``chunker`` will split.

        Args:
            text: The full text to be chunked.
        """
        self.text = text

    def chunker(self, size=1000, overlap=0):
        """Split the stored text on single spaces into chunks.

        Args:
            size: Forwarded to the splitter as ``chunk_size``.
            overlap: Forwarded to the splitter as ``chunk_overlap``. This used
                to be hard-coded to the float ``0.9``; the default is now the
                integer ``0``.
                NOTE(review): with whole-number piece lengths, 0.9 should have
                behaved exactly like 0 -- confirm against the installed
                LangChain version.

        Returns:
            The result of ``CharacterTextSplitter.split_text`` for the stored
            text.
        """
        # Imported at call time, so LangChain is only needed when chunking is
        # actually requested.
        from langchain.text_splitter import CharacterTextSplitter

        # TODO: attach the duration of the video to each chunk,
        # i.e. return [[chunk, duration], ...]
        text_splitter = CharacterTextSplitter(
            separator=" ",
            chunk_size=size,
            chunk_overlap=overlap,
        )
        return text_splitter.split_text(self.text)

    def __sizeof__(self) -> int:
        """Return the number of items (characters, for a string) in the text.

        Note that this is an element count of ``self.text``, not a size in
        bytes.
        """
        return len(self.text)
def getSubsText(video_id="", getGenerated=False):
    """Return the text of a video's transcript as a single line.

    The transcripts listed for ``video_id`` are walked in the order the API
    yields them and the last one is used. Its entries are rendered with
    ``TextFormatter`` and every newline is replaced by a space.

    Args:
        video_id: Identifier passed to ``YouTubeTranscriptApi.list_transcripts``.
        getGenerated: Currently has no effect (see the TODO below).

    Returns:
        The formatted transcript text with newlines replaced by spaces.
    """
    from youtube_transcript_api import YouTubeTranscriptApi as ytapi
    from youtube_transcript_api.formatters import TextFormatter

    if getGenerated:
        # TODO: implement getGenerated
        pass

    # Only the last listed transcript ever contributed to the result, so
    # remember it and fetch once instead of downloading every transcript.
    last_transcript = None
    for transcript in ytapi.list_transcripts(video_id):
        last_transcript = transcript

    # With nothing listed, the formatter is handed an empty string, exactly
    # as it was before.
    data = last_transcript.fetch() if last_transcript is not None else ""
    return TextFormatter().format_transcript(data).replace("\n", " ")