"""Turn PDFs, Word documents, YouTube transcripts and audio into text chunks."""

import re
from typing import List, Optional

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader,
    UnstructuredWordDocumentLoader,
    YoutubeLoader,
)
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
from youtube_transcript_api import TranscriptsDisabled, YouTubeTranscriptApi


class ContentProcessor:
    """Load content from several source kinds and split it into chunks.

    Every ``process_*`` method returns the documents produced by the shared
    ``RecursiveCharacterTextSplitter`` (1000-character chunks, 200 overlap).
    """

    # Compiled once at class creation. Dots are escaped so that only the real
    # host names match, and the ID stops at '&', '?', '#' or a newline.
    _VIDEO_ID_PATTERNS = (
        re.compile(
            r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)"
            r"([^&\n?#]+)"
        ),
        re.compile(r"youtube\.com/shorts/([^&\n?#]+)"),
    )

    def __init__(self):
        """Create the text splitter shared by all ``process_*`` methods."""
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )

    def _load_and_split(self, loader) -> List[Document]:
        """Run ``loader`` and split its output with the shared splitter."""
        return loader.load_and_split(self.text_splitter)

    def process_pdf(self, file_path) -> List[Document]:
        """Load the PDF at ``file_path`` and return it as split chunks."""
        return self._load_and_split(PyPDFLoader(file_path))

    def process_docx(self, file_path) -> List[Document]:
        """Load the Word document at ``file_path`` and return split chunks."""
        return self._load_and_split(UnstructuredWordDocumentLoader(file_path))

    def process_youtube(self, video_url: str) -> List[Document]:
        """Fetch the transcript of a YouTube video and return it as chunks.

        Args:
            video_url: URL in watch, youtu.be, embed or shorts form.

        Returns:
            Chunks of the joined transcript text; each chunk's metadata
            carries ``{"source": video_url}``.

        Raises:
            ValueError: If no video ID can be extracted from ``video_url``.
            Exception: If the transcript cannot be retrieved. The original
                error is attached as ``__cause__``.
        """
        video_id = self._extract_video_id(video_url)
        if not video_id:
            raise ValueError(
                "This appears to be an invalid YouTube URL. "
                "Please check the URL and try again."
            )

        # Only the transcript retrieval is guarded, so failures in document
        # construction or splitting are not mislabelled as transcript errors.
        # NOTE(review): newer youtube-transcript-api releases appear to replace
        # the static get_transcript() with an instance API - confirm against
        # the pinned version.
        try:
            transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
            full_transcript = " ".join(entry["text"] for entry in transcript_list)
        except TranscriptsDisabled as exc:
            raise Exception(
                "This video does not have subtitles/captions enabled. "
                "Please try a different video that has captions available."
            ) from exc
        except Exception as exc:
            raise Exception(
                f"Unable to get transcript: {exc}. \n"
                "Please ensure the video has captions enabled."
            ) from exc

        # Wrap the transcript in a Document so the splitter can process it.
        doc = Document(
            page_content=full_transcript,
            metadata={"source": video_url},
        )
        return self.text_splitter.split_documents([doc])

    def _extract_video_id(self, url: str) -> Optional[str]:
        """Return the video ID found in ``url``, or ``None`` if there is none.

        Recognises ``youtube.com/watch?v=``, ``youtu.be/``,
        ``youtube.com/embed/`` and ``youtube.com/shorts/`` URLs.
        """
        for pattern in self._VIDEO_ID_PATTERNS:
            match = pattern.search(url)
            if match:
                return match.group(1)
        return None

    def process_audio(self, audio_file) -> List[Document]:
        """Transcribe audio with the OpenAI Whisper parser and split the text.

        Args:
            audio_file: Passed to ``GenericLoader.from_filesystem`` as the
                path to load - assumed to be a filesystem path to the audio
                file; TODO confirm against callers.

        Returns:
            Chunks of the documents produced by the loader.
        """
        # GenericLoader's constructor takes (blob_loader, blob_parser) and has
        # no ``parser`` keyword; ``from_filesystem`` is the factory that does.
        loader = GenericLoader.from_filesystem(
            audio_file,
            parser=OpenAIWhisperParser(),
        )
        transcript = loader.load()
        return self.text_splitter.split_documents(transcript)