# Spaces: Sleeping
import re

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader,
    UnstructuredWordDocumentLoader,
    YoutubeLoader,
)
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
from youtube_transcript_api import TranscriptsDisabled, YouTubeTranscriptApi


class ContentProcessor:
    """Load PDF, Word, YouTube and audio content and split it into chunks.

    Every ``process_*`` method runs its content through the same
    ``RecursiveCharacterTextSplitter`` (``chunk_size=1000``,
    ``chunk_overlap=200``) that is created in ``__init__``.
    """

    # Tried in order; group 1 is the video id. Dots are escaped so that only
    # the literal host names match, and ``+`` rejects an empty id.
    _VIDEO_ID_PATTERNS = (
        re.compile(
            r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([^&\n?]+)"
        ),
        re.compile(r"(?:youtube\.com/shorts/)([^&\n?]+)"),
    )

    def __init__(self):
        """Create the text splitter shared by all ``process_*`` methods."""
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )

    def process_pdf(self, file_path):
        """Load the PDF at ``file_path`` and return it split into chunks."""
        loader = PyPDFLoader(file_path)
        return loader.load_and_split(self.text_splitter)

    def process_docx(self, file_path):
        """Load the Word document at ``file_path`` and return it split into chunks."""
        loader = UnstructuredWordDocumentLoader(file_path)
        return loader.load_and_split(self.text_splitter)

    def process_youtube(self, video_url):
        """Fetch the transcript of a YouTube video and return it split into chunks.

        The transcript entries are joined with single spaces into one
        document whose metadata records ``video_url`` as its ``source``.

        Raises:
            ValueError: if no video id can be extracted from ``video_url``.
            Exception: if captions are disabled for the video, or the
                transcript cannot be fetched or processed for any other
                reason. The original error is attached as ``__cause__``.
        """
        video_id = self._extract_video_id(video_url)
        if not video_id:
            raise ValueError("This appears to be an invalid YouTube URL. Please check the URL and try again.")

        try:
            transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
            full_transcript = " ".join(entry["text"] for entry in transcript_list)

            # Wrap the transcript in a Document so the splitter can handle it
            # like loader output.
            from langchain.schema import Document

            doc = Document(
                page_content=full_transcript,
                metadata={"source": video_url},
            )
            return self.text_splitter.split_documents([doc])
        except TranscriptsDisabled as exc:
            raise Exception("This video does not have subtitles/captions enabled. Please try a different video that has captions available.") from exc
        except Exception as exc:
            raise Exception(f"Unable to get transcript: {exc}. Please ensure the video has captions enabled.") from exc

    def _extract_video_id(self, url):
        """Return the video id found in ``url``, or ``None`` if there is none.

        Recognizes ``youtube.com/watch?v=``, ``youtu.be/``,
        ``youtube.com/embed/`` and ``youtube.com/shorts/`` URLs. The id ends
        at the first ``&``, ``?`` or newline.
        """
        for pattern in self._VIDEO_ID_PATTERNS:
            match = pattern.search(url)
            if match:
                return match.group(1)
        return None

    def process_audio(self, audio_file):
        """Transcribe audio with ``OpenAIWhisperParser`` and return it split into chunks.

        NOTE(review): assumes ``audio_file`` is a filesystem path (a single
        file or a directory of audio files) - confirm against the caller.
        """
        # GenericLoader's constructor expects a blob loader and a blob parser;
        # from_filesystem is the entry point that accepts a path and `parser=`.
        loader = GenericLoader.from_filesystem(
            audio_file,
            parser=OpenAIWhisperParser(),
        )
        transcript = loader.load()
        return self.text_splitter.split_documents(transcript)