Spaces:
Sleeping
Sleeping
File size: 3,646 Bytes
c347d26 29ce305 c347d26 c17054a c347d26 c17054a c347d26 c17054a c347d26 c17054a c347d26 c17054a c347d26 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
from langchain_community.document_loaders import (
PyPDFLoader,
UnstructuredWordDocumentLoader,
YoutubeLoader
)
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from youtube_transcript_api import (
YouTubeTranscriptApi,
TranscriptsDisabled,
NoTranscriptFound,
NoTranscriptAvailable
)
import re
import logging
# Module-level logger used by the code in this file.
logger = logging.getLogger(__name__)
# Configure the root logger at import time so INFO-level records are emitted.
logging.basicConfig(level=logging.INFO)
class ContentProcessor:
    """Load PDF, Word, YouTube and audio sources and split them into chunks.

    Every ``process_*`` method returns the output of the shared
    ``RecursiveCharacterTextSplitter`` stored on ``self.text_splitter``.
    """

    # Compiled once at class creation; tried in order by ``_extract_video_id``.
    _VIDEO_ID_PATTERNS = (
        # Watch pages: ``v=`` may follow other query parameters.
        re.compile(r'youtube\.com/watch\?(?:[^#\s]*&)?v=([^&#?/\s]+)'),
        # Short links, embeds (including the youtube-nocookie host) and Shorts.
        re.compile(
            r'(?:youtu\.be/|youtube(?:-nocookie)?\.com/(?:embed|shorts)/)'
            r'([^&#?/\s]+)'
        ),
    )

    def __init__(self, chunk_size=1000, chunk_overlap=200):
        """Create the text splitter shared by all ``process_*`` methods.

        Args:
            chunk_size: Passed to ``RecursiveCharacterTextSplitter``.
            chunk_overlap: Passed to ``RecursiveCharacterTextSplitter``.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def process_pdf(self, file_path):
        """Load the PDF at ``file_path`` and return it split into chunks."""
        loader = PyPDFLoader(file_path)
        return loader.load_and_split(self.text_splitter)

    def process_docx(self, file_path):
        """Load the Word document at ``file_path`` and return it split into chunks."""
        loader = UnstructuredWordDocumentLoader(file_path)
        return loader.load_and_split(self.text_splitter)

    def process_youtube(self, video_url):
        """Fetch a YouTube video's transcript and return it split into chunks.

        Args:
            video_url: URL of the video (watch, youtu.be, embed or shorts form).

        Returns:
            The chunks produced by ``self.text_splitter.split_documents`` for a
            single document holding the whole transcript, with
            ``metadata={"source": video_url}``.

        Raises:
            ValueError: If no video ID can be extracted from ``video_url``.
            Exception: If the transcript cannot be fetched or processed; the
                underlying error is attached as ``__cause__``.
        """
        from langchain.schema import Document

        logger.info("Processing YouTube URL: %s", video_url)

        video_id = self._extract_video_id(video_url)
        if not video_id:
            logger.error("Invalid YouTube URL: %s", video_url)
            raise ValueError("This appears to be an invalid YouTube URL. Please check the URL and try again.")
        logger.info("Extracted video ID: %s", video_id)

        # Diagnostic only: a failure here is logged and must not abort the
        # actual transcript fetch below.
        try:
            available = YouTubeTranscriptApi.list_transcripts(video_id)
            logger.info("Available transcripts: %s", available)
        except Exception as e:
            logger.error("Error listing transcripts: %s", e)

        try:
            entries = YouTubeTranscriptApi.get_transcript(video_id)
            full_transcript = " ".join(entry['text'] for entry in entries)
            doc = Document(
                page_content=full_transcript,
                metadata={"source": video_url},
            )
            return self.text_splitter.split_documents([doc])
        except Exception as e:
            logger.error("Error getting transcript: %s", e)
            # Chain the cause so the original traceback is not lost.
            raise Exception(
                f"Unable to access video transcript. Error: {str(e)}\nPlease try a video with available captions."
            ) from e

    def _extract_video_id(self, url):
        """Return the video ID found in ``url``, or ``None`` if there is none.

        Recognises ``youtube.com/watch?...v=ID`` (with ``v`` anywhere in the
        query string), ``youtu.be/ID``, ``youtube.com/embed/ID``,
        ``youtube-nocookie.com/embed/ID`` and ``youtube.com/shorts/ID``.
        The ID stops at the first ``&``, ``?``, ``#``, ``/`` or whitespace.
        """
        # Non-string or empty input is reported as "no ID" rather than
        # letting ``re`` raise TypeError.
        if not isinstance(url, str) or not url:
            return None
        for pattern in self._VIDEO_ID_PATTERNS:
            match = pattern.search(url)
            if match:
                return match.group(1)
        return None

    def process_audio(self, audio_file):
        """Transcribe audio with ``OpenAIWhisperParser`` and return the chunks.

        Args:
            audio_file: Presumably a filesystem path to an audio file or
                directory (TODO confirm against callers). An object exposing
                ``yield_blobs`` is treated as a ready-made blob loader.
        """
        parser = OpenAIWhisperParser()
        if hasattr(audio_file, "yield_blobs"):
            # GenericLoader's constructor is (blob_loader, blob_parser).
            loader = GenericLoader(audio_file, parser)
        else:
            # ``parser=`` is a keyword of ``from_filesystem``, not of the
            # constructor.
            loader = GenericLoader.from_filesystem(audio_file, parser=parser)
        transcript = loader.load()
        return self.text_splitter.split_documents(transcript)