"""Turn PDF, Word, YouTube and audio sources into chunked LangChain documents."""

import re
from typing import List, Optional

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader,
    UnstructuredWordDocumentLoader,
    YoutubeLoader,
)
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
from youtube_transcript_api import YouTubeTranscriptApi


class ContentProcessingError(Exception):
    """Raised when a source cannot be converted into text."""


class ContentProcessor:
    """Load content from several source types and split it into chunks.

    Every ``process_*`` method returns the documents produced by the shared
    ``RecursiveCharacterTextSplitter`` stored on ``self.text_splitter``.
    """

    # Compiled once at class creation. Dots are escaped so look-alike hosts
    # are rejected, and the ID stops at the first query, fragment, path or
    # whitespace delimiter and must be non-empty.
    _VIDEO_ID_PATTERNS = (
        # youtube.com/watch?v=ID, with ``v`` anywhere in the query string.
        re.compile(r"youtube\.com/watch\?(?:[^#\s]*&)?v=([^&?#/\s]+)"),
        # youtu.be/ID, youtube.com/embed/ID and youtube.com/shorts/ID.
        re.compile(r"(?:youtu\.be/|youtube\.com/(?:embed|shorts)/)([^&?#/\s]+)"),
    )

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200) -> None:
        """Create the splitter shared by every ``process_*`` method.

        Args:
            chunk_size: Maximum size of each chunk, passed to the splitter.
            chunk_overlap: Overlap between consecutive chunks, passed to the
                splitter.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def process_pdf(self, file_path: str) -> List[Document]:
        """Load the PDF at *file_path* and return it split into chunks."""
        # Same result as ``load_and_split`` but avoids the method LangChain
        # marks as effectively deprecated, and matches the other processors.
        return self.text_splitter.split_documents(PyPDFLoader(file_path).load())

    def process_docx(self, file_path: str) -> List[Document]:
        """Load the Word document at *file_path* and return it split into chunks."""
        loader = UnstructuredWordDocumentLoader(file_path)
        return self.text_splitter.split_documents(loader.load())

    def process_youtube(self, video_url: str) -> List[Document]:
        """Fetch the transcript of a YouTube video and return it split into chunks.

        Args:
            video_url: A ``watch``, ``youtu.be``, ``embed`` or ``shorts`` URL.

        Returns:
            Chunks of the space-joined transcript; each chunk's metadata
            records ``video_url`` under ``"source"``.

        Raises:
            ValueError: If no video ID can be extracted from *video_url*.
            ContentProcessingError: If the transcript cannot be fetched. This
                subclasses ``Exception``, so existing ``except Exception``
                handlers keep working.
        """
        video_id = self._extract_video_id(video_url)
        if not video_id:
            raise ValueError("Invalid YouTube URL")

        # Only the fetch is wrapped, so a splitter failure is not reported as
        # a transcript failure. The catch is broad because the transcript
        # library raises many unrelated exception types.
        try:
            full_transcript = self._fetch_transcript_text(video_id)
        except Exception as e:
            raise ContentProcessingError(f"Error getting transcript: {e}") from e

        doc = Document(page_content=full_transcript, metadata={"source": video_url})
        return self.text_splitter.split_documents([doc])

    @staticmethod
    def _fetch_transcript_text(video_id: str) -> str:
        """Return the whole transcript of *video_id* as one space-joined string."""
        legacy_fetch = getattr(YouTubeTranscriptApi, "get_transcript", None)
        if legacy_fetch is not None:
            # Releases that still ship the static helper return a list of
            # dicts with a "text" key.
            return " ".join(entry["text"] for entry in legacy_fetch(video_id))
        # Newer releases replace the static helper with an instance ``fetch``
        # whose result iterates over snippet objects exposing ``.text``.
        fetched = YouTubeTranscriptApi().fetch(video_id)
        return " ".join(snippet.text for snippet in fetched)

    def _extract_video_id(self, url) -> Optional[str]:
        """Return the video ID found in *url*, or ``None`` if there is none.

        Non-string input yields ``None`` instead of raising, so the caller's
        "invalid URL" branch handles it.
        """
        if not isinstance(url, str):
            return None
        for pattern in self._VIDEO_ID_PATTERNS:
            match = pattern.search(url)
            if match:
                return match.group(1)
        return None

    def process_audio(self, audio_file) -> List[Document]:
        """Transcribe audio with OpenAI Whisper and return the text in chunks.

        Args:
            audio_file: ``str`` or ``pathlib.Path`` pointing at an audio file
                (or a directory of audio files) on the local filesystem.

        Returns:
            The transcript documents split by ``self.text_splitter``.
        """
        # ``GenericLoader.__init__`` takes ``(blob_loader, blob_parser)`` and
        # has no ``parser`` keyword; ``from_filesystem`` is the constructor
        # that accepts a path together with ``parser=``.
        loader = GenericLoader.from_filesystem(
            audio_file,
            parser=OpenAIWhisperParser(),
        )
        return self.text_splitter.split_documents(loader.load())