adriiita committed on
Commit
6135bb3
·
verified ·
1 Parent(s): 06681ea

Update processors/input_processor.py

Browse files
Files changed (1) hide show
  1. processors/input_processor.py +4 -7
processors/input_processor.py CHANGED
@@ -27,16 +27,12 @@ class ContentProcessor:
27
  return pages
28
 
29
  def process_youtube(self, video_url):
30
- # Extract video ID from URL
31
  video_id = self._extract_video_id(video_url)
32
  if not video_id:
33
- raise ValueError("Invalid YouTube URL")
34
 
35
  try:
36
- # Get transcript directly using youtube_transcript_api
37
  transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
38
-
39
- # Combine all transcript pieces
40
  full_transcript = " ".join([entry['text'] for entry in transcript_list])
41
 
42
  # Create a document-like structure
@@ -46,11 +42,12 @@ class ContentProcessor:
46
  metadata={"source": video_url}
47
  )
48
 
49
- # Split the document
50
  return self.text_splitter.split_documents([doc])
51
 
 
 
52
  except Exception as e:
53
- raise Exception(f"Error getting transcript: {str(e)}")
54
 
55
  def _extract_video_id(self, url):
56
  # Handle different YouTube URL formats
 
27
  return pages
28
 
29
  def process_youtube(self, video_url):
 
30
  video_id = self._extract_video_id(video_url)
31
  if not video_id:
32
+ raise ValueError("This appears to be an invalid YouTube URL. Please check the URL and try again.")
33
 
34
  try:
 
35
  transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
 
 
36
  full_transcript = " ".join([entry['text'] for entry in transcript_list])
37
 
38
  # Create a document-like structure
 
42
  metadata={"source": video_url}
43
  )
44
 
 
45
  return self.text_splitter.split_documents([doc])
46
 
47
+ except TranscriptsDisabled:
48
+ raise Exception("This video does not have subtitles/captions enabled. Please try a different video that has captions available.")
49
  except Exception as e:
50
+ raise Exception(f"Unable to get transcript: {str(e)}. Please ensure the video has captions enabled.")
51
 
52
  def _extract_video_id(self, url):
53
  # Handle different YouTube URL formats