ChillThrills committed on
Commit 19098d4 · 1 Parent(s): d4d544b

Implement code changes to enhance functionality and improve performance

Files changed (1):
  1. app.py +346 -282
app.py CHANGED
@@ -15,13 +15,18 @@ from abc import ABC, abstractmethod
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from concurrent.futures import TimeoutError as FuturesTimeoutError
  from collections import defaultdict
+ import tempfile # Added for robust temporary directory management

  try:
  import google.generativeai as genai
- from google.generativeai.types import GenerationConfig
+ from google.generativeai.types import GenerationConfig, HarmCategory, HarmBlockThreshold, FinishReason, HarmProbability
  except ImportError:
  genai = None
  GenerationConfig = None
+ HarmCategory = None # Added for safety settings/finish reason details
+ HarmBlockThreshold = None # Added for safety settings
+ FinishReason = None # Added for checking candidate finish reason
+ HarmProbability = None # Added for checking safety ratings probability
  print("WARNING: google-generativeai library not found. Install with: pip install google-generativeai")

  try:
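A note on the widened fallback above: every google.generativeai name the file references later is now bound to None when the import fails, so later feature checks stay plain truthiness tests. A minimal sketch of the pattern (the llm_available helper is illustrative, not part of app.py):

try:
    import google.generativeai as genai
    from google.generativeai.types import GenerationConfig
except ImportError:
    # Bind the names to None so later checks need no try/except.
    genai = None
    GenerationConfig = None

def llm_available() -> bool:
    # Hypothetical helper: callers branch on this before touching the LLM.
    return genai is not None and GenerationConfig is not None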
@@ -65,7 +70,7 @@ except ImportError:
  print("WARNING: librosa library not found. Audio processing may be impaired. Install with: pip install librosa")

  try:
- import openpyxl
+ import openpyxl
  except ImportError:
  openpyxl = None
  print("WARNING: openpyxl library not found. .xlsx file processing might fail. Install with: pip install openpyxl")
@@ -103,28 +108,28 @@ GOOGLE_GEMINI_API_KEY = os.getenv("GOOGLE_GEMINI_API_KEY")
  TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

  AGENT_DEFAULT_TIMEOUT = 15
- MAX_CONTEXT_LENGTH_LLM = 30000
+ MAX_CONTEXT_LENGTH_LLM = 30000

- MAX_FILE_SIZE = 5 * 1024 * 1024
+ MAX_FILE_SIZE = 5 * 1024 * 1024
  CSV_SAMPLE_ROWS = 10
- MAX_FILE_CONTEXT_LENGTH = 10000
+ MAX_FILE_CONTEXT_LENGTH = 10000

  # Global instances for video analysis pipelines
  video_object_detector_pipeline: Optional[Any] = None
  video_vqa_pipeline: Optional[Any] = None # Changed from species_classifier to VQA

- VIDEO_ANALYSIS_DEVICE: int = -1
+ VIDEO_ANALYSIS_DEVICE: int = -1
  VIDEO_ANALYSIS_OBJECT_MODEL = "facebook/detr-resnet-50"
  VIDEO_ANALYSIS_VQA_MODEL = "Salesforce/blip-vqa-capfilt-large" # Using a VQA model

- VIDEO_MAX_FRAMES_TO_PROCESS = 120
+ VIDEO_MAX_FRAMES_TO_PROCESS = 120
  VIDEO_CONFIDENCE_THRESHOLD_BIRD = 0.6
  VIDEO_VQA_MIN_ANSWER_LENGTH = 3 # Minimum length for a VQA answer to be considered a species
- VIDEO_VQA_CONFIDENCE_THRESHOLD = 0.3
+ VIDEO_VQA_CONFIDENCE_THRESHOLD = 0.3

  asr_pipeline_instance: Optional[Any] = None
- ASR_MODEL_NAME = "openai/whisper-tiny"
- ASR_PROCESSING_TIMEOUT_SECONDS = 1024
+ ASR_MODEL_NAME = "openai/whisper-tiny"
+ ASR_PROCESSING_TIMEOUT_SECONDS = 1024


  DEFAULT_RAG_CONFIG = {
@@ -136,7 +141,7 @@ DEFAULT_RAG_CONFIG = {
  'tavily_api_key': TAVILY_API_KEY,
  'default_max_results': 3, 'retry_attempts': 2, 'retry_delay': 2,
  'google_timeout': 8, 'tavily_depth': "basic",
- 'max_query_length_tavily': 380
+ 'max_query_length_tavily': 380
  },
  'processing': {
  'trusted_sources': {'wikipedia.org': 0.8, 'reuters.com': 0.75, 'apnews.com': 0.75},
@@ -165,7 +170,7 @@ def _get_video_object_detector():
  # Simplified device selection, consistent with FileProcessor's ASR
  device_id = 0 if torch.cuda.is_available() else -1
  if VIDEO_ANALYSIS_DEVICE == -1 : VIDEO_ANALYSIS_DEVICE = device_id # Set global if not user-overridden
-
+
  target_device = VIDEO_ANALYSIS_DEVICE if VIDEO_ANALYSIS_DEVICE != -1 else device_id

  video_object_detector_pipeline = hf_transformers_pipeline(
@@ -185,7 +190,7 @@ def _get_video_vqa_pipeline(): # Renamed and changed to load VQA
  try:
  device_id = 0 if torch.cuda.is_available() else -1
  if VIDEO_ANALYSIS_DEVICE == -1: VIDEO_ANALYSIS_DEVICE = device_id
-
+
  target_device = VIDEO_ANALYSIS_DEVICE if VIDEO_ANALYSIS_DEVICE != -1 else device_id

  video_vqa_pipeline = hf_transformers_pipeline(
@@ -205,7 +210,7 @@ class FileProcessor:
  global asr_pipeline_instance
  if asr_pipeline_instance is None and hf_transformers_pipeline and torch:
  try:
- device = -1
+ device = -1
  asr_pipeline_instance = hf_transformers_pipeline(
  "automatic-speech-recognition",
  model=ASR_MODEL_NAME,
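The ASR pipeline here is built lazily and cached in a module-level global. A sketch of that lazy-singleton shape, assuming the transformers and torch packages (get_asr is an illustrative name, not the app's API):

from typing import Any, Optional
from transformers import pipeline

_asr: Optional[Any] = None

def get_asr(model_name: str = "openai/whisper-tiny") -> Any:
    # Build once, reuse afterwards; device=-1 pins inference to CPU.
    global _asr
    if _asr is None:
        _asr = pipeline("automatic-speech-recognition", model=model_name, device=-1)
    return _asr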
@@ -287,7 +292,7 @@ class FileProcessor:
  f"Columns: {', '.join(df.columns)}\nFirst {min(CSV_SAMPLE_ROWS, len(df))} sample rows:\n{df.head(CSV_SAMPLE_ROWS).to_markdown(index=False)}"
  )
  return FileProcessor._truncate_text(summary, filename, "CSV")
- except Exception as e:
+ except Exception as e:
  if "tabulate" in str(e).lower() and df is not None:
  gaia_logger.error(f"CSV to_markdown error for '{filename}' (missing tabulate): {e}", exc_info=False)
  try:
@@ -330,7 +335,7 @@ class FileProcessor:
  break
  except UnicodeDecodeError: continue
  if text is None: text = content.decode('utf-8', errors='ignore')
-
+
  summary = f"Text Document: '{filename}':\n{text}"
  return FileProcessor._truncate_text(summary, filename, "Text")
  except Exception as e:
@@ -341,13 +346,13 @@ class FileProcessor:
  gaia_logger.info(f"Processing Excel file: {filename}")
  if not openpyxl: return f"Error: Excel processing skipped for '{filename}', openpyxl library not available."
  xls = None
- df_list_for_fallback = []
+ df_list_for_fallback = []
  try:
  xls = pd.ExcelFile(io.BytesIO(content), engine='openpyxl')
  summary_parts = [f"Excel Document Summary: '{filename}'"]
  for sheet_name in xls.sheet_names:
  df = xls.parse(sheet_name)
- df_list_for_fallback.append((sheet_name, df))
+ df_list_for_fallback.append((sheet_name, df))
  sheet_summary = (
  f"\n---\nSheet: '{sheet_name}' ({len(df)} rows, {len(df.columns)} columns):\n"
  f"Columns: {', '.join(df.columns)}\nFirst {min(CSV_SAMPLE_ROWS, len(df))} sample rows:\n{df.head(CSV_SAMPLE_ROWS).to_markdown(index=False)}"
@@ -358,19 +363,20 @@ class FileProcessor:
  break
  full_summary = "".join(summary_parts)
  return FileProcessor._truncate_text(full_summary, filename, "Excel")
- except Exception as e:
+ except Exception as e:
  if "tabulate" in str(e).lower():
  gaia_logger.error(f"Excel to_markdown error for '{filename}' (missing tabulate): {e}", exc_info=False)
  try:
  summary_parts_fallback = [f"Excel Document Summary: '{filename}'"]
- if not df_list_for_fallback and xls:
+ if not df_list_for_fallback and xls:
  for sheet_name in xls.sheet_names:
  df_list_for_fallback.append((sheet_name, xls.parse(sheet_name)))
- elif not xls and not df_list_for_fallback:
+ elif not xls and not df_list_for_fallback: # Ensure df_list_for_fallback is populated if xls parsing failed early
  temp_xls = pd.ExcelFile(io.BytesIO(content), engine='openpyxl')
  for sheet_name in temp_xls.sheet_names:
  df_list_for_fallback.append((sheet_name, temp_xls.parse(sheet_name)))

+
  for sheet_name_fb, df_fb in df_list_for_fallback:
  sheet_summary_fallback = (
  f"\n---\nSheet: '{sheet_name_fb}' ({len(df_fb)} rows, {len(df_fb.columns)} columns):\n"
@@ -400,7 +406,7 @@ class FileProcessor:
  page_text = page.extract_text()
  if page_text:
  text_content += page_text + "\n"
- if len(text_content) > MAX_FILE_CONTEXT_LENGTH * 1.2:
+ if len(text_content) > MAX_FILE_CONTEXT_LENGTH * 1.2: # Check slightly over to allow truncation logic to handle it
  break
  if not text_content:
  return f"PDF Document: '{filename}'. No text could be extracted or PDF is empty."
@@ -412,7 +418,7 @@ class FileProcessor:
  @staticmethod
  def _perform_asr_transcription(asr_pipeline_ref, audio_data_np, filename_for_log):
  gaia_logger.info(f"ASR: Starting transcription for {filename_for_log} in thread.")
-
+
  return asr_pipeline_ref(audio_data_np, chunk_length_s=30, return_timestamps=False, generate_kwargs={"language": "en"})


@@ -424,11 +430,11 @@ class FileProcessor:
  return f"Error: Audio processing skipped for '{filename}', ASR pipeline not available."
  if not librosa:
  return f"Error: Audio processing skipped for '{filename}', librosa library not available."
-
+
  try:
  with io.BytesIO(content) as audio_buffer:
  y, sr = librosa.load(audio_buffer, sr=16000, mono=True)
-
+
  duration_seconds = len(y) / sr
  gaia_logger.info(f"Audio file: {filename}, Duration: {duration_seconds:.2f} seconds. Timeout set to: {ASR_PROCESSING_TIMEOUT_SECONDS}s")
  start_time = time.time()
@@ -442,7 +448,7 @@ class FileProcessor:
  except FuturesTimeoutError:
  gaia_logger.warning(f"ASR transcription for '{filename}' timed out after {ASR_PROCESSING_TIMEOUT_SECONDS} seconds.")
  return f"Error: Audio transcription for '{filename}' timed out after {ASR_PROCESSING_TIMEOUT_SECONDS}s."
- except Exception as e_thread:
+ except Exception as e_thread:
  gaia_logger.error(f"ASR transcription thread for '{filename}' failed: {e_thread}", exc_info=True)
  if "3000 mel input features" in str(e_thread) or "return_timestamps" in str(e_thread):
  return f"Error processing Audio file '{filename}': Transcription failed due to long-form audio issue (mel features/timestamps). Original error: {str(e_thread)}"
@@ -453,7 +459,7 @@ class FileProcessor:

  if not transcribed_text.strip():
  return f"Audio Document: '{filename}'. Transcription result was empty or ASR failed."
-
+
  summary = f"Audio Document (Transcription): '{filename}':\n{transcribed_text}"
  return FileProcessor._truncate_text(summary, filename, "Audio Transcription")

@@ -472,7 +478,7 @@ class FileProcessor:
  except Exception:
  return f"File with Unknown Content Type: '{filename}'. Content is likely binary and cannot be displayed as text."

- class CacheManager:
+ class CacheManager:
  def __init__(self, ttl: int = 300, max_size: int = 100, name: str = "Cache"):
  self.ttl = ttl; self.max_size = max_size
  self._cache: Dict[Any, Any] = {}; self._timestamps: Dict[Any, float] = {}
@@ -483,31 +489,31 @@ class CacheManager:
  try:
  self._access_order.remove(key); self._access_order.append(key)
  return copy.deepcopy(self._cache[key])
- except (ValueError, TypeError) as e:
  self.delete(key); return None
- elif key in self._cache:
  self.delete(key)
  return None
  def set(self, key: Any, value: Any):
- if key in self._cache: self.delete(key)
  while len(self._cache) >= self.max_size and self._access_order:
  old_key = self._access_order.pop(0)
- if old_key in self._cache:
  del self._cache[old_key]; del self._timestamps[old_key]
  try: self._cache[key] = copy.deepcopy(value)
- except TypeError: self._cache[key] = value
  self._timestamps[key] = time.time(); self._access_order.append(key)
  def delete(self, key: Any):
  if key in self._cache:
  try:
  del self._cache[key]; del self._timestamps[key]
  if key in self._access_order: self._access_order.remove(key)
- except (ValueError, KeyError): pass
  def clear(self): self._cache.clear();self._timestamps.clear();self._access_order.clear();gaia_logger.info(f"[{self.name}] Cleared.")
  def __len__(self): return len(self._cache)
  def __contains__(self, key): return key in self._cache and (time.time()-self._timestamps.get(key,0)<self.ttl)

- class SearchProvider(ABC):
  def __init__(self, config_dict: Dict):
  self.provider_config = config_dict.get('search', {})
  self._enabled = False
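CacheManager pairs a TTL check with LRU eviction over an access-order list. A compact sketch of the same policy (simplified: no deepcopy, illustrative names):

import time
from collections import OrderedDict

class TTLLRUCache:
    def __init__(self, ttl: float = 300, max_size: int = 100):
        self.ttl, self.max_size = ttl, max_size
        self._data = OrderedDict()  # key -> (timestamp, value)

    def get(self, key):
        item = self._data.get(key)
        if item is None:
            return None
        stamp, value = item
        if time.time() - stamp >= self.ttl:  # expired: evict and report a miss
            del self._data[key]
            return None
        self._data.move_to_end(key)  # mark as most recently used
        return value

    def set(self, key, value):
        self._data.pop(key, None)
        while len(self._data) >= self.max_size:  # evict least recently used
            self._data.popitem(last=False)
        self._data[key] = (time.time(), value)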
@@ -532,7 +538,7 @@ class SearchProvider(ABC):
  return self._perform_search(query, max_results)
  def available(self) -> bool: return self._enabled

- class GoogleProvider(SearchProvider):
  @property
  def provider_name(self) -> str: return "Google"
  def __init__(self, config_dict: Dict):
@@ -554,7 +560,7 @@ class GoogleProvider(SearchProvider):
  except requests.exceptions.RequestException as e: gaia_logger.warning(f"[{self.provider_name}] RequestEx: '{query[:70]}': {e}"); return None
  except Exception as e: gaia_logger.error(f"[{self.provider_name}] Error: '{query[:70]}': {e}", exc_info=True); return None

- class TavilyProvider(SearchProvider):
  @property
  def provider_name(self) -> str: return "Tavily"
  def __init__(self, config_dict: Dict):
@@ -579,7 +585,7 @@ class TavilyProvider(SearchProvider):
  return [{'href': h.get('url'), 'title': h.get('title',''), 'body': h.get('content','')} for h in hits]
  except Exception as e: gaia_logger.warning(f"[{self.provider_name}] Search fail: '{query[:70]}': {e}"); return None

- class DuckDuckGoProvider(SearchProvider):
  @property
  def provider_name(self) -> str: return "DuckDuckGo"
  def __init__(self, config_dict: Dict):
@@ -596,9 +602,9 @@ class DuckDuckGoProvider(SearchProvider):
  return [{'href': r.get('href'), 'title': r.get('title',''), 'body': r.get('body','')} for r in hits]
  except Exception as e: gaia_logger.warning(f"[{self.provider_name}] Search fail: '{query[:70]}': {e}"); return None

- class CompositeSearchClient:
  def __init__(self, config_dict: Dict):
- self.config = config_dict
  self._search_config = config_dict.get('search', {})
  self.providers = self._init_providers(config_dict)
  self.cache = CacheManager(
@@ -634,24 +640,24 @@ class CompositeSearchClient:
  results = prov.search(q, actual_r)
  if results is not None: self.cache.set(cache_key, results); return results
  if attempt < self._retry_att: time.sleep(self._retry_del)
- except Exception as e:
  if attempt < self._retry_att: time.sleep(self._retry_del)
  self.cache.set(cache_key, [])
  return []

- class GaiaQueryBuilder:
  def __init__(self, base_query: str, config_dict: Dict):
  self.base_query = base_query.strip()
  self.config = config_dict
  def get_queries(self) -> Dict[str, List[Tuple[str, str]]]:
  return {'primary': [(self.base_query, 'GENERAL')]} if self.base_query else {'primary': []}

- class ResultProcessor:
  def __init__(self, config_dict: Dict):
  self.proc_config = config_dict.get('processing', {})
  self.trusted_sources = self.proc_config.get('trusted_sources', {})
  self.seen_urls: Set[str] = set()
- self.date_pattern = DEFAULT_RAG_CONFIG['processing'].get('date_pattern', r'\b\d{4}\b')
  def process_batch(self, results: List[Dict], query_tag: str, initial_cat: str='GENERAL') -> List[Dict]:
  processed: List[Dict] = []
  if not results: return processed
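The search path above walks providers in priority order, retries each with a fixed delay, and caches whatever first succeeds. Its control flow reduced to a skeleton (assumes provider objects with available() and search(); names illustrative):

import time

def failover_search(providers, query, max_results, attempts=2, delay=2):
    for provider in providers:
        if not provider.available():
            continue
        for attempt in range(attempts + 1):
            try:
                results = provider.search(query, max_results)
                if results is not None:
                    return results  # first non-None answer wins
            except Exception:
                pass  # treat provider errors like a miss and retry
            if attempt < attempts:
                time.sleep(delay)
    return []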
@@ -675,7 +681,7 @@ class ResultProcessor:
  result['temporal_relevance'] = temporal_r
  result['combined_score'] = (source_q * 0.6 + temporal_r * 0.4)

- class ContentEnricher:
  def __init__(self, config_dict: Dict):
  self.enrich_config = config_dict.get('enrichment', {})
  self._enabled = self.enrich_config.get('enabled', False) and bool(BeautifulSoup)
@@ -717,7 +723,7 @@ class ContentEnricher:
  except Exception as e: result['enrichment_failed'] = type(e).__name__
  return result

- class GeneralRAGPipeline:
  def __init__(self, config_dict: Optional[Dict] = None):
  self.config = config_dict if config_dict is not None else DEFAULT_RAG_CONFIG
  self.search_client = CompositeSearchClient(self.config)
@@ -735,13 +741,13 @@ class GeneralRAGPipeline:
  max_r_pq = cfg_search.get('default_max_results', 3)
  cache_key = (q, max_r_pq, total_lim, enrich_en, enrich_cnt)
  if not force_refresh and (cached := self.pipeline_cache.get(cache_key)) is not None: return cached
- if force_refresh: self.search_client.cache.clear();
- if self.enricher: self.enricher.cache.clear()
  all_res, res_proc = [], ResultProcessor(self.config)
  staged_qs = GaiaQueryBuilder(q, self.config).get_queries()
  for stage, qs_in_stage in staged_qs.items():
  for query_s, cat in qs_in_stage:
- if len(all_res) >= total_lim * 2: break
  s_res = self.search_client.search(query_s, max_results=max_r_pq, force_refresh=force_refresh)
  all_res.extend(res_proc.process_batch(s_res or [], query_s, initial_cat=cat))
  all_res.sort(key=lambda x: x.get('combined_score', 0), reverse=True)
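After the staged queries run, results are sorted by combined_score, and ResultProcessor's seen_urls set implies URL-level deduplication. A sketch of dedup-then-rank under those assumptions (illustrative, not the pipeline's exact code):

def rank_results(results):
    seen, unique = set(), []
    for r in results:
        url = r.get('href')
        if url in seen:
            continue  # keep only the first occurrence of each URL
        if url:
            seen.add(url)
        unique.append(r)
    return sorted(unique, key=lambda r: r.get('combined_score', 0), reverse=True)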
@@ -758,36 +764,46 @@ class GaiaLevel1Agent:
  self.api_url = api_url
  self.llm_model: Optional[Any] = None
  self.rag_pipeline = GeneralRAGPipeline(DEFAULT_RAG_CONFIG)
-
  if genai and GOOGLE_GEMINI_API_KEY:
  try:
  genai.configure(api_key=GOOGLE_GEMINI_API_KEY)
  model_name = 'gemini-2.5-flash-preview-05-20'
  self.llm_model = genai.GenerativeModel(model_name)
  gaia_logger.info(f"Gemini LLM ('{model_name}') initialized.")
  except Exception as e:
  gaia_logger.error(f"Error initializing Gemini LLM: {e}", exc_info=True)
  else:
  gaia_logger.warning("Gemini LLM dependencies or API key missing.")
-
  if not self.llm_model:
  gaia_logger.warning("LLM (Gemini) unavailable. Limited capabilities.")
-
  _get_video_object_detector()
- _get_video_vqa_pipeline()

  gaia_logger.info(f"GaiaLevel1Agent (RAG, FileProcessor, VideoAnalysis) initialized. API: {self.api_url}")

  @lru_cache(maxsize=32)
  def _fetch_and_process_file_content(self, task_id: str) -> Optional[str]:
-
  file_url = f"{self.api_url}/files/{task_id}"
- for attempt in range(2):
  try:
  response = requests.get(file_url, timeout=AGENT_DEFAULT_TIMEOUT)
  response.raise_for_status()
-
- filename = FileProcessor._get_filename_from_url(response.url)
  content_disposition = response.headers.get('Content-Disposition')
  if content_disposition:
  header_filename = FileProcessor._get_filename_from_url(content_disposition)
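The constructor above follows the standard google-generativeai flow. The same three calls in isolation (model name copied from the hunk; error handling elided):

import os
import google.generativeai as genai

genai.configure(api_key=os.environ["GOOGLE_GEMINI_API_KEY"])
model = genai.GenerativeModel('gemini-2.5-flash-preview-05-20')
# later: model.generate_content(prompt, generation_config=..., safety_settings=...)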
@@ -800,7 +816,7 @@ class GaiaLevel1Agent:
  except requests.exceptions.HTTPError as e:
  if e.response.status_code == 404:
  gaia_logger.warning(f"File not found for task {task_id}: {file_url}")
- return None
  gaia_logger.warning(f"HTTP error fetching file {task_id}: {e}")
  except requests.exceptions.Timeout:
  gaia_logger.warning(f"Timeout fetching file {task_id}")
@@ -813,37 +829,37 @@ class GaiaLevel1Agent:
  def _clean_vqa_species_answer(self, answer_text: str) -> str:
  """Cleans and normalizes VQA answer to extract a potential species name."""
  if not answer_text: return ""
-
  cleaned = answer_text.lower().strip()
-
  # Remove common prefixes
  prefixes_to_remove = [
- "a type of ", "a variety of ", "it's a ", "it is a ", "an ", "a ", "the ",
  "this is a ", "this bird is a ", "it appears to be a ", "looks like a ",
  "it's an ", "it is an ", "this is an ", "this bird is an ", "it appears to be an ", "looks like an "
  ]
  for prefix in prefixes_to_remove:
  if cleaned.startswith(prefix):
  cleaned = cleaned[len(prefix):]
-
  # Remove common suffixes
  suffixes_to_remove = [" bird", " species"]
  for suffix in suffixes_to_remove:
  if cleaned.endswith(suffix):
  cleaned = cleaned[:-len(suffix)]
-
  # Remove parenthetical content or descriptive clauses if simple
  cleaned = re.sub(r"\s*\(.*\)\s*$", "", cleaned).strip() # e.g. "robin (american)" -> "robin"
  cleaned = re.sub(r",\s*which is.*$", "", cleaned).strip() # e.g. "sparrow, which is small" -> "sparrow"

  # Basic character filtering (allow letters, numbers for things like "Type 2", spaces, hyphens)
  cleaned = re.sub(r"[^a-z0-9\s\-]", "", cleaned).strip()
-
  # Normalize whitespace
  cleaned = " ".join(cleaned.split())
-
  # Filter out very generic or uncertain answers post-cleaning
- uncertain_terms = ["unknown", "not sure", "unclear", "difficult to say", "generic", "common bird"]
  if any(term in cleaned for term in uncertain_terms) or len(cleaned) < VIDEO_VQA_MIN_ANSWER_LENGTH:
  return "" # Return empty if too generic or short

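To make the cleaning rules above concrete, a few expected input/output pairs (hypothetical examples, not from the repo's tests):

# "It's a mallard bird"      -> "mallard"   (prefix "it's a " and suffix " bird" stripped)
# "a type of heron (grey)"   -> "heron"     (prefix plus trailing parenthetical removed)
# "not sure"                 -> ""          (uncertain term filtered out)
# "owl"                      -> "owl"       (kept: len("owl") >= VIDEO_VQA_MIN_ANSWER_LENGTH)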
@@ -858,45 +874,48 @@ class GaiaLevel1Agent:
  return "Video analysis skipped: Pillow library not available."

  detector = _get_video_object_detector()
- vqa_model = _get_video_vqa_pipeline() # Get VQA model

  if not detector or not vqa_model:
  return "Video analysis skipped: ML pipelines (detector or VQA) not available."

- video_file_path = None
- temp_dir = "temp_video_files_gaia_" + str(time.time()).replace(".","") # More unique temp dir
- os.makedirs(temp_dir, exist_ok=True)

  try:
  ydl_opts = {
- 'format': 'best[ext=mp4][height<=480]/best[ext=webm][height<=480]/bestvideo[height<=480]+bestaudio/best',
  'outtmpl': os.path.join(temp_dir, '%(id)s.%(ext)s'),
- 'quiet': True, # Quieter download
- 'max_filesize': 75 * 1024 * 1024,
  'overwrites': True, 'noprogress': True, 'noplaylist': True, 'socket_timeout': 20,
- 'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192',}] # helps ensure one file sometimes
  }
  gaia_logger.info(f"Attempting to download video: {video_url}")
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
- info_dict = ydl.extract_info(video_url, download=True)
- video_file_path = ydl.prepare_filename(info_dict)
- # yt-dlp might download separate video/audio and then merge.
- # Ensure we have the video file, not just an audio file if download failed partway.
- if not video_file_path.endswith(('.mp4', '.webm', '.mkv', '.flv', '.avi', '.mov')): # common video extensions
- possible_video_files = [f for f in os.listdir(temp_dir) if f.startswith(info_dict.get('id','')) and f.endswith(('.mp4', '.webm'))]
- if possible_video_files: video_file_path = os.path.join(temp_dir, possible_video_files[0])
- else:
- gaia_logger.error(f"Downloaded file '{video_file_path}' does not appear to be a video format.")
- # Attempt to find any video file in the temp_dir for this ID
- all_files = [os.path.join(temp_dir, f) for f in os.listdir(temp_dir) if info_dict.get('id','') in f]
- gaia_logger.debug(f"Files in temp_dir for video ID {info_dict.get('id','')}: {all_files}")
- # Clean up and report failure
- for f_cleanup in all_files:
- try: os.remove(f_cleanup)
- except Exception: pass
- try: os.rmdir(temp_dir)
- except Exception: pass
- return f"Video download resulted in a non-video file: {os.path.basename(video_file_path)}"

  if not video_file_path or not os.path.exists(video_file_path):
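The new import tempfile in this commit suggests the hand-rolled temp_dir bookkeeping above could become a context manager. A sketch of that alternative (assuming the same outtmpl idea; this is not the code the hunk ships):

import os
import tempfile
import yt_dlp

def download_video_bytes(url: str) -> bytes:
    # TemporaryDirectory cleans up even if yt-dlp leaves .part files behind.
    with tempfile.TemporaryDirectory(prefix="gaia_video_") as tmp:
        opts = {"outtmpl": os.path.join(tmp, "%(id)s.%(ext)s"), "quiet": True, "noplaylist": True}
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(url, download=True)
            path = ydl.prepare_filename(info)
        with open(path, "rb") as fh:
            return fh.read()  # copy out before the directory disappears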
@@ -908,43 +927,46 @@ class GaiaLevel1Agent:
  cap = cv2.VideoCapture(video_file_path)
  if not cap.isOpened():
  gaia_logger.error(f"Cannot open video file: {video_file_path}")
- return "Cannot open video file."

  max_simultaneous_species = 0
  species_details_for_max_frame = ""
-
  total_frames_video = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
  fps = cap.get(cv2.CAP_PROP_FPS)
- if not fps or fps == 0: fps = 25 # Default fps
-
  frame_interval = max(1, int(fps)) # Process ~1 frame per second
-
  frames_analyzed_count = 0
  current_frame_num = 0
-
- gaia_logger.info(f"Video Info: ~{total_frames_video // fps:.0f}s, {fps:.2f} FPS. Analyzing ~1 frame/sec up to {VIDEO_MAX_FRAMES_TO_PROCESS} frames.")

  while cap.isOpened() and frames_analyzed_count < VIDEO_MAX_FRAMES_TO_PROCESS:
  cap.set(cv2.CAP_PROP_POS_FRAMES, current_frame_num) # Jump to frame
  ret, frame_data = cap.read()
  if not ret: break

- timestamp_sec = current_frame_num / fps
  gaia_logger.info(f"Processing frame {current_frame_num} (analyzed {frames_analyzed_count+1}/{VIDEO_MAX_FRAMES_TO_PROCESS}) at ~{timestamp_sec:.1f}s")
-
  try:
  pil_image = Image.fromarray(cv2.cvtColor(frame_data, cv2.COLOR_BGR2RGB))
  except Exception as e_conv:
  gaia_logger.warning(f"Frame {current_frame_num} conversion to PIL failed: {e_conv}")
  current_frame_num += frame_interval
  continue
-
  detected_objects = detector(pil_image)
  bird_crops_this_frame = []
  for obj in detected_objects:
- if obj['label'].lower() == 'bird' and obj['score'] > VIDEO_CONFIDENCE_THRESHOLD_BIRD: # Check lowercase label too
  box = obj['box']
  xmin, ymin, xmax, ymax = box['xmin'], box['ymin'], box['xmax'], box['ymax']
  if not (0 <= xmin < xmax <= pil_image.width and 0 <= ymin < ymax <= pil_image.height):
  gaia_logger.debug(f"Invalid box for bird: {box}, img size: {pil_image.size}")
  continue
@@ -953,6 +975,7 @@ class GaiaLevel1Agent:
  except Exception as e_crop:
  gaia_logger.warning(f"Cropping bird failed for box {box}: {e_crop}")

  if not bird_crops_this_frame:
  current_frame_num += frame_interval
  frames_analyzed_count += 1
@@ -963,45 +986,45 @@ class GaiaLevel1Agent:
  vqa_question = "What is the specific species of this bird?"

  for idx, bird_crop_img in enumerate(bird_crops_this_frame):
- if bird_crop_img.width < 20 or bird_crop_img.height < 20: continue
  try:
- vqa_answer_list = vqa_model(bird_crop_img, question=vqa_question, top_k=1) # Some models return list
-
  raw_vqa_answer_text = ""
- vqa_confidence = VIDEO_VQA_CONFIDENCE_THRESHOLD # Default if not provided

  if isinstance(vqa_answer_list, list) and vqa_answer_list:
  raw_vqa_answer_text = vqa_answer_list[0].get('answer', "")
  vqa_confidence = vqa_answer_list[0].get('score', vqa_confidence)
- elif isinstance(vqa_answer_list, dict): # Some pipelines might return dict directly
  raw_vqa_answer_text = vqa_answer_list.get('answer', "")
  vqa_confidence = vqa_answer_list.get('score', vqa_confidence)

  cleaned_species_name = self._clean_vqa_species_answer(raw_vqa_answer_text)
-
  if cleaned_species_name and vqa_confidence >= VIDEO_VQA_CONFIDENCE_THRESHOLD :
  frame_species_identified.add(cleaned_species_name)
  current_frame_species_details.append(f"{cleaned_species_name} (VQA conf: {vqa_confidence:.2f})")
- elif cleaned_species_name: # Log if below confidence
  gaia_logger.debug(f"VQA species '{cleaned_species_name}' (raw: '{raw_vqa_answer_text}') for bird {idx} below confidence {VIDEO_VQA_CONFIDENCE_THRESHOLD} (score: {vqa_confidence:.2f})")
  else:
  gaia_logger.debug(f"VQA for bird {idx} resulted in unusable/generic species: '{raw_vqa_answer_text}'")

  except Exception as e_vqa:
  gaia_logger.warning(f"VQA inference error for bird crop {idx} (frame {current_frame_num}): {e_vqa}")
-
  if len(frame_species_identified) > max_simultaneous_species:
  max_simultaneous_species = len(frame_species_identified)
  species_details_for_max_frame = f"At ~{timestamp_sec:.1f}s, inferred species: {', '.join(current_frame_species_details) if current_frame_species_details else 'None specific'}"
-
  if frame_species_identified:
  gaia_logger.info(f"Frame {current_frame_num} (~{timestamp_sec:.1f}s): Found {len(frame_species_identified)} distinct species types: {', '.join(list(frame_species_identified))}")

  current_frame_num += frame_interval
  frames_analyzed_count += 1
-
- cap.release()
-
  context_str = (f"Video analysis result: The highest number of distinct bird species types inferred simultaneously "
  f"in the analyzed portion of the video (up to {VIDEO_MAX_FRAMES_TO_PROCESS} frames) was {max_simultaneous_species}. "
  f"{('Details from a frame with this count: ' + species_details_for_max_frame) if species_details_for_max_frame else 'No specific species details captured for the max count frame or no birds found.'}")
@@ -1010,72 +1033,70 @@ class GaiaLevel1Agent:

  except yt_dlp.utils.DownloadError as e:
  gaia_logger.error(f"yt-dlp download error for {video_url}: {str(e)}")
- # Attempt to get a cleaner error message
- msg_lines = str(e).splitlines()
- clean_msg = msg_lines[-1] if msg_lines else str(e)
- if "Unsupported URL" in str(e): clean_msg = "Unsupported video URL"
- elif "video unavailable" in str(e).lower(): clean_msg = "Video is unavailable"
- return f"Video download failed: {clean_msg}"
  except Exception as e:
  gaia_logger.error(f"Error during video analysis for {video_url}: {e}", exc_info=True)
- return f"An unexpected error occurred during video analysis: {type(e).__name__} - {str(e)}"
  finally:
- if video_file_path and os.path.exists(video_file_path):
- try: os.remove(video_file_path)
- except Exception as e_remove: gaia_logger.warning(f"Could not remove temp video file {video_file_path}: {e_remove}")
-
- # Attempt to remove the temporary directory if it's empty
- # This needs to be robust in case other files were created by yt-dlp or ffmpeg
- try:
- if os.path.exists(temp_dir):
- # List all files in temp_dir to attempt removal if needed.
- # For now, just try rmdir if it's truly empty, or log if not.
- if not os.listdir(temp_dir):
- os.rmdir(temp_dir)
- gaia_logger.info(f"Removed empty temp video directory: {temp_dir}")
- else:
- # If not empty, it might contain other yt-dlp artifacts (like .part files, audio)
- # For robustness in a contest, maybe leave it and rely on system temp cleaning,
- # or implement more aggressive cleanup of all files within this specific temp_dir.
- # For now, just log.
- gaia_logger.warning(f"Temp video directory {temp_dir} not empty after processing. Manual cleanup might be needed for: {os.listdir(temp_dir)}")
- except OSError as e_rmdir: # Catch OSError for rmdir failures (e.g. dir not empty)
- gaia_logger.warning(f"Could not remove temp video directory {temp_dir} (possibly not empty or access issue): {e_rmdir}")
- except Exception as e_final_clean:
- gaia_logger.error(f"Unexpected error during final cleanup of {temp_dir}: {e_final_clean}")


  def _parse_llm_output(self, llm_text: str) -> Dict[str, str]:
- # ... (this method remains unchanged) ...
  reasoning_trace = ""
  model_answer = ""
  final_answer_sentinel = "FINAL ANSWER:"
-
  parts = llm_text.split(final_answer_sentinel, 1)
-
  if len(parts) == 2:
  reasoning_trace = parts[0].strip()
  model_answer = parts[1].strip()
  else:
- reasoning_trace = llm_text
  lines = llm_text.strip().split('\n')
- model_answer = lines[-1].strip() if lines else "Could not parse answer"
  gaia_logger.warning(f"LLM output did not contain '{final_answer_sentinel}'. Using fallback parsing. Full LLM text: '{llm_text[:200]}...'")

  return {"model_answer": model_answer, "reasoning_trace": reasoning_trace}

  def _formulate_answer_with_llm(self, question: str, file_context: Optional[str], web_context: Optional[str]) -> Dict[str, str]:
- # ... (this method's prompt might need slight adjustment if video context phrasing changes, but core logic is fine) ...
  default_model_answer = "Information not available in provided context"
  default_reasoning = "LLM processing failed or context insufficient."

- if not self.llm_model:
- gaia_logger.warning("LLM model (Gemini) not available for answer formulation.")
- reasoning = "LLM model (Gemini) not available for answer formulation."
  answer_val = default_model_answer
  if web_context and file_context:
  reasoning += " Context from file and web was found but not processed by LLM."
- elif web_context: # web_context may now include video_context
  reasoning += f" External context found: {web_context.splitlines()[0] if web_context.splitlines() else 'No specific snippet found.'}"
  elif file_context:
  reasoning += f" File context found: {file_context[:100]}..."
@@ -1083,8 +1104,9 @@ class GaiaLevel1Agent:
  reasoning += " No context found."
  return {"model_answer": answer_val, "reasoning_trace": reasoning}

  prompt_parts = [
- "You are a general AI assistant. Your primary goal is to answer the user's question accurately and concisely based *only* on the provided context (from a document, web search results, or video analysis).",
  "If the context comes from 'Video analysis result', understand that 'species types inferred' means the video was analyzed by an AI to identify birds and infer their species using visual question answering. The count refers to the maximum number of *distinct types* of birds identified in this way in any single analyzed video frame.",
  "First, think step-by-step and briefly explain your reasoning based on the context. This part is for clarity and should come before your final answer.",
  "After your reasoning, you MUST conclude your response with the exact phrase 'FINAL ANSWER:', followed by your answer on the same line or the next.",
@@ -1093,7 +1115,7 @@ class GaiaLevel1Agent:
  " - If the answer is a string: use as few words as possible. Do not use articles (a, an, the) unless grammatically essential. Do not use abbreviations (e.g., write 'United States' not 'USA', 'Los Angeles' not 'LA') unless the question implies an abbreviation or it's a very common, universally understood one relevant to the context. Write digits in plain text (e.g., 'two' not '2') if they are part of a descriptive phrase, but use numerical digits if the question implies a code, identifier, version number, or a direct numerical value is more natural (e.g., 'Windows 10', 'part number 5').",
  " - If the answer is a list of items: provide them as a comma-separated list (e.g., item1, item2, item3). Apply the number or string rules above to each element in the list.",
  " - If the context is insufficient to answer the question: your reasoning should clearly state this, and your FINAL ANSWER should be 'Information not available in provided context'. Do not invent answers.",
- "Prioritize information from 'Enriched Content' from web search results if available and relevant over shorter 'Snippets'. Information from 'Video Analysis Context' is highly specific to video-related questions.",
  "\nUser Question: ", question
  ]

@@ -1102,81 +1124,113 @@ class GaiaLevel1Agent:
  if file_context:
  file_header = "\n\nContext from Provided Document:\n---"
  file_footer = "\n---"
- # Calculate available length more carefully
  len_web_ctx = len(web_context) if web_context else 0
- max_len_for_file = MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - len_web_ctx - len(file_header) - len(file_footer) - 500 # Buffer for LLM answer template
-
- if max_len_for_file > 100 :
  truncated_file_context = file_context[:max_len_for_file]
  if len(file_context) > len(truncated_file_context):
  truncated_file_context += " ... (file context truncated)"
  prompt_parts.extend([file_header, truncated_file_context, file_footer])
  current_prompt_text_len += len(file_header) + len(truncated_file_context) + len(file_footer)
  context_added = True
- else: gaia_logger.warning(f"Not enough space for file context in LLM prompt. Needed {max_len_for_file}, available after other parts: {MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - len_web_ctx - len(file_header) - len(file_footer)}")


- if web_context: # This can include video analysis context
- # Determine header based on content
  header_text = "\n\nContext from External Sources (Web/Video):\n---"
- if "Video analysis result:" in web_context and "Source [" not in web_context:
  header_text = "\n\nContext from Video Analysis:\n---"
- elif "Source [" in web_context and "Video analysis result:" not in web_context:
  header_text = "\n\nContext from Web Search Results:\n---"
-
  web_footer = "\n---"
- # current_prompt_text_len already includes the base prompt_parts
- # We need to calculate available length for web_context based on what's already added.
- available_len_for_web = MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - len(header_text) - len(web_footer) - 300 # Buffer for LLM answer itself

- if available_len_for_web > 100:
  truncated_web_context = web_context
  if len(web_context) > available_len_for_web:
  truncated_web_context = web_context[:available_len_for_web] + "\n... (external context truncated)"
  gaia_logger.info(f"Truncated external (web/video) context from {len(web_context)} to {len(truncated_web_context)} chars for LLM.")
  prompt_parts.extend([header_text, truncated_web_context, web_footer])
- context_added = True # context_added flag might be set by file_context too
- else: gaia_logger.warning(f"Not enough space for web/video context in LLM prompt. Needed {available_len_for_web}, available after other parts and file_ctx: {MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - len(header_text) - len(web_footer)}")

  if not context_added: prompt_parts.append("\n\nNo document, web, or video context could be provided due to length constraints or availability.")
- prompt_parts.append("\n\nReasoning and Final Answer:")
  final_prompt = "\n".join(prompt_parts)
-
  gaia_logger.info(f"LLM Prompt (first 300 chars): {final_prompt[:300]}...")
  gaia_logger.info(f"LLM Total prompt length: {len(final_prompt)} chars.")

- if not GenerationConfig: # Should be caught by class init
- return {"model_answer": "LLM configuration error", "reasoning_trace": "GenerationConfig class not available."}

  try:
- gen_config = GenerationConfig(temperature=0.1, top_p=0.95, max_output_tokens=1024) # Reduced max output tokens slightly
- safety_set = [{"category": c, "threshold": "BLOCK_MEDIUM_AND_ABOVE"} for c in ["HARM_CATEGORY_HARASSMENT", "HARM_CATEGORY_HATE_SPEECH", "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_DANGEROUS_CONTENT"]]
- response = self.llm_model.generate_content(final_prompt, generation_config=gen_config, safety_settings=safety_set)
-
- if not response.candidates or (hasattr(response, 'prompt_feedback') and response.prompt_feedback.block_reason):
- reason_text = "Unknown"
- if hasattr(response, 'prompt_feedback') and response.prompt_feedback.block_reason: reason_text = response.prompt_feedback.block_reason.name
- gaia_logger.warning(f"Gemini response blocked. Reason: {reason_text}.")
- # Provide more specific message if possible
- block_details = ""
- if hasattr(response, 'prompt_feedback') and response.prompt_feedback.safety_ratings:
- block_details = "; ".join([f"{sr.category.name}: {sr.probability.name}" for sr in response.prompt_feedback.safety_ratings if sr.blocked])
- return {"model_answer": "LLM Error: Response blocked", "reasoning_trace": f"My response was blocked by the LLM provider (Reason: {reason_text}). Details: {block_details}"}
-
-
- llm_answer_text = response.text
  gaia_logger.info(f"LLM Raw Full Answer (first 200 chars): {llm_answer_text[:200]}...")
  return self._parse_llm_output(llm_answer_text)
-
  except Exception as e:
  gaia_logger.error(f"Error calling Gemini API: {e}", exc_info=True)
  error_type_name = type(e).__name__
  error_message = str(e)
  reasoning = f"Error calling Gemini API: {error_type_name} - {error_message}"
  answer_val = "LLM API error"
-
- # Check for common API error types from google.generativeai.types.generation_types.BlockedPromptException or similar
- # This requires inspecting the actual exception object 'e' or its attributes if it's a specific API exception type
  if "API key" in error_message.lower() and ("invalid" in error_message.lower() or "not valid" in error_message.lower()):
  answer_val = "LLM Auth Error"
  reasoning = "LLM API key is invalid or not authorized."
@@ -1186,27 +1240,30 @@ class GaiaLevel1Agent:
  elif "InternalServerError" in error_type_name or "500" in error_message :
  answer_val = "LLM server error"
  reasoning = "Error: LLM experienced an internal server error."
- # Add more specific google.generativeai error handling if possible by inspecting 'e' type
- # For example, if 'e' is an instance of google.generativeai.types.StopCandidateException for safety block

  return {"model_answer": answer_val, "reasoning_trace": reasoning}

  def __call__(self, question: str, task_id: Optional[str] = None) -> Dict[str, str]:
  gaia_logger.info(f"Agent processing: '{question[:70]}...', TaskID: {task_id}")
  q_lower = question.lower().strip()
-
  video_context_str: Optional[str] = None
- # More specific regex to avoid matching general URLs in text that happen to have 'v='
- video_url_match = re.search(r"(https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)[\w\-=&%]+)", question)


- # Keywords to trigger video analysis for bird species counting
- video_keywords = ["video", "youtube.com", "youtu.be"]
- species_keywords = ["species", "bird", "birds", "type of bird", "kinds of bird"]
- action_keywords = ["count", "how many", "number of", "simultaneously", "at the same time", "on camera"]

  if video_url_match and \
- any(vk in question for vk in video_keywords) and \
  any(sk in q_lower for sk in species_keywords) and \
  any(ak in q_lower for ak in action_keywords):
  video_url = video_url_match.group(0)
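The tightened regex above only fires on youtube.com/watch?v= and youtu.be URLs, so incidental 'v=' query strings no longer trigger video analysis. For example (illustrative):

import re

VIDEO_URL_RE = re.compile(r"(https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)[\w\-=&%]+)")

assert VIDEO_URL_RE.search("see https://www.youtube.com/watch?v=dQw4w9WgXcQ please")
assert VIDEO_URL_RE.search("https://youtu.be/dQw4w9WgXcQ")
assert VIDEO_URL_RE.search("https://example.com/page?v=1") is None  # plain 'v=' no longer matches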
@@ -1220,80 +1277,79 @@ class GaiaLevel1Agent:
  return {"model_answer": "general AI assistant", "reasoning_trace": "User asked for my identity."}

  file_ctx_str: Optional[str] = None
- # Check for file-related keywords more carefully
- file_indicators = ["document", "file", "text provided", "attachment", "content of the file", "data in the file", "excel sheet", ".pdf", ".csv", ".txt", "audio file", "code snippet", "log file"]
- # Question might imply a file without using the task_id directly if it refers to "the provided text" etc.
- # For GAIA, task_id is usually present if a file is relevant.
- if task_id and (any(fi in q_lower for fi in file_indicators) or "this task involves a file" in q_lower or "the provided" in q_lower):
  file_ctx_str = self._fetch_and_process_file_content(task_id)
  if file_ctx_str: gaia_logger.info(f"Processed file context ({len(file_ctx_str)} chars) for task {task_id}")
  else: gaia_logger.warning(f"No file content or failed to process for task {task_id}")
-
- web_rag_ctx_str: Optional[str] = None # Separate for clarity
  needs_web_rag = True
-
- if video_context_str:
- # If video analysis provided a direct answer to a video-specific question, web RAG might be less critical or supplementary.
- # Example: "Count birds in video X" -> video_context_str is primary.
- # Example: "What is the habitat of birds shown in video X?" -> video context helps identify birds, web helps find habitat.
- # For now, if video_context exists, we'll assume it might answer the question, but LLM can decide if web is still needed from prompt.
- # Let's refine this: if the question *only* seems answerable by the video analysis itself, skip web.
- if all(ak_q in q_lower for ak_q in ["count", "how many", "simultaneously"]) and any(sk_q in q_lower for sk_q in species_keywords):
- needs_web_rag = False
- gaia_logger.info("Video context seems primary for a counting question; RAG web search might be skipped or deprioritized by LLM.")
-
-
- if file_ctx_str and len(file_ctx_str) > 300 and not video_context_str:
- # If a substantial file context exists, and no video context, evaluate if web is needed.
- # This logic remains similar.
- web_still_needed_kws = ["what is", "who is", "current", "latest", "news", "public opinion", "recent events", "search for", "find information on", "browse", "look up", "compare with", "external information"]
- doc_can_answer_kws = ["summarize", "according to the document", "in the provided text", "based on the file content"]
  if any(kw in q_lower for kw in doc_can_answer_kws) and not any(kw in q_lower for kw in web_still_needed_kws):
  needs_web_rag = False
- elif not any(kw in q_lower for kw in web_still_needed_kws) and not question.endswith("?"): # Heuristic: statements might rely on file.
- # More nuanced: if it's a question that clearly can't be answered by a generic file.
- if not ("why" in q_lower or "how" in q_lower or "explain" in q_lower or "what if" in q_lower):
  needs_web_rag = False

- # Explicit negative constraint for web search
  if "don't search" in q_lower or "do not search" in q_lower or "without searching" in q_lower or "without using the internet" in q_lower:
  needs_web_rag = False
  gaia_logger.info("Web RAG explicitly disabled by user query.")
-
  if needs_web_rag:
- search_q = question.replace("?", "").strip() # Basic cleaning
- # Potentially refine search_q if file_context or video_context gives hints
- # For now, use original question for RAG query
- rag_res = self.rag_pipeline.analyze(query=search_q, force_refresh=False)
  if rag_res:
  snippets = []
  for i, res_item in enumerate(rag_res):
  title = res_item.get('title','N/A')
  body = res_item.get('body','')
  href = res_item.get('href','#')
- provider = res_item.get('query_tag','WebSearch') # From RAG pipeline
- prefix = "EnrichedContent" if res_item.get('enriched') else "Snippet"
- body_preview = (body[:1500] + "...") if len(body) > 1500 else body # Max length for snippet
- snippets.append(f"Source [{i+1} - {provider}]: {title}\nURL: {href}\n{prefix}: {body_preview}\n---")
  web_rag_ctx_str = "\n\n".join(snippets)
  if web_rag_ctx_str: gaia_logger.info(f"RAG pipeline yielded web results ({len(web_rag_ctx_str)} chars).")
  else: gaia_logger.warning("RAG pipeline yielded no web results for the query.")
-
- # Consolidate external context for the LLM
  final_llm_external_context_parts = []
  if video_context_str:
- final_llm_external_context_parts.append(f"Video Analysis Context:\n{video_context_str}")
  if web_rag_ctx_str:
- final_llm_external_context_parts.append(f"Web Search Context:\n{web_rag_ctx_str}")
-
- final_llm_external_context = "\n\n---\n\n".join(final_llm_external_context_parts) if final_llm_external_context_parts else None
-
  agent_response_dict = self._formulate_answer_with_llm(question, file_ctx_str, final_llm_external_context)
  gaia_logger.info(f"LLM-based model_answer (first 70 chars): {agent_response_dict.get('model_answer', '')[:70]}...")
  return agent_response_dict
-

- def run_and_submit_all(profile: gr.OAuthProfile | None):
  space_id = os.getenv("SPACE_ID")
  if profile: username = f"{profile.username}"
  else: return "Please Login to Hugging Face.", None
@@ -1306,11 +1362,15 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
  questions_data = response.json()
  if not questions_data or not isinstance(questions_data, list): return "Questions list empty/invalid.", None
  except Exception as e: return f"Error fetching questions: {e}", None
-
  results_log_for_gradio, answers_for_api_submission = [], []
- GEMINI_RPM_LIMIT = int(os.getenv("GEMINI_RPM_LIMIT", "60"))
- sleep_llm = (60.0 / GEMINI_RPM_LIMIT) + 0.5 if GEMINI_RPM_LIMIT > 0 else 0.2
-
  for i, item in enumerate(questions_data):
  task_id, q_text = item.get("task_id"), item.get("question")
  model_answer_val = "AGENT ERROR"
@@ -1322,7 +1382,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
  results_log_for_gradio.append({"Task ID": task_id, "Question": q_text, "Submitted Answer": model_answer_val, "Reasoning Trace": reasoning_trace_val})
  answers_for_api_submission.append({"task_id": task_id, "submitted_answer": model_answer_val})
  continue
-
  gaia_logger.info(f"Q {i+1}/{len(questions_data)} - Task: {task_id}")
  try:
  agent_response_dict = agent(question=q_text, task_id=task_id)
@@ -1332,24 +1392,24 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
  gaia_logger.error(f"Error during agent call for task {task_id}: {e}", exc_info=True)
  model_answer_val = "AGENT EXECUTION ERROR"
  reasoning_trace_val = f"Agent call failed: {type(e).__name__} - {str(e)}"
-
  answers_for_api_submission.append({"task_id": task_id, "submitted_answer": model_answer_val})
  results_log_for_gradio.append({"Task ID": task_id, "Question": q_text, "Submitted Answer": model_answer_val, "Reasoning Trace (first 500 chars)": reasoning_trace_val[:500] + ("..." if len(reasoning_trace_val) > 500 else "")})
-
  if i < len(questions_data) - 1: time.sleep(sleep_llm)
-
  if not answers_for_api_submission: return "Agent produced no answers for API submission.", pd.DataFrame(results_log_for_gradio or [{"Info": "No questions processed"}])
-
  submission_payload_for_api = {
- "username": username.strip(),
- "agent_code": agent_code,
- "answers": answers_for_api_submission
  }
  gaia_logger.info(f"Submitting {len(answers_for_api_submission)} answers for '{username}' to API...")
  gaia_logger.debug(f"API Submission Payload Sample: {json.dumps(submission_payload_for_api)[:500]}")

  try:
- response = requests.post(submit_url, json=submission_payload_for_api, timeout=60);
  response.raise_for_status()
  result_data = response.json()
  status = (f"Submission Successful!\nUser: {result_data.get('username')}\nScore: {result_data.get('score','N/A')}% "
@@ -1361,37 +1421,36 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
  return f"Submission Failed: {err_detail}", pd.DataFrame(results_log_for_gradio)
  except Exception as e: return f"Submission Failed: {e}", pd.DataFrame(results_log_for_gradio)

- with gr.Blocks(title="GAIA RAG Agent - Advanced") as demo:
- gr.Markdown("# AGENT")
  gr.Markdown(
  """
  **Instructions:**
- 1. Log in to Hugging Face.
- 2. Click 'Run Evaluation & Submit All Answers'.
  ---
- Agent uses RAG, advanced File Processing, and LLM.
  """
  )
  gr.LoginButton()
  run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
  status_output = gr.Textbox(label="Status / Submission Result", lines=5, interactive=False)
- results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
  run_button.click(fn=run_and_submit_all, inputs=[], outputs=[status_output, results_table])

- if __name__ == "__main__":
- print("\n" + "-"*30 + " GAIA Level 1 Agent - RAG, FileProc, Video Analysis " + "-"*30)
  required_env = {
- "GOOGLE_GEMINI_API_KEY": GOOGLE_GEMINI_API_KEY,
- "GOOGLE_API_KEY": GOOGLE_CUSTOM_SEARCH_API_KEY,
- "GOOGLE_CSE_ID": GOOGLE_CUSTOM_SEARCH_CSE_ID,
  "TAVILY_API_KEY": TAVILY_API_KEY,
  }
  missing_keys = [k for k, v in required_env.items() if not v]
  for k, v in required_env.items(): print(f"✅ {k} found." if v else f"⚠️ WARNING: {k} not set.")
-
- # Check for all critical libraries
  libraries_to_check = [
- ("transformers", hf_transformers_pipeline), ("torch", torch),
  ("librosa", librosa), ("openpyxl", openpyxl), ("pdfplumber", pdfplumber),
  ("yt_dlp", yt_dlp), ("cv2 (opencv-python)", cv2), ("BeautifulSoup", BeautifulSoup),
  ("duckduckgo_search", DDGS), ("googleapiclient", build_google_search_service),
@@ -1402,6 +1461,11 @@ if __name__ == "__main__":

  if missing_keys: print(f"\n--- PLEASE SET MISSING ENV VARS FOR FULL FUNCTIONALITY: {', '.join(missing_keys)} ---\n")
  else: print("\n--- All major API Key Environment Variables found. ---")
-
  print("-"*(60 + len(" GAIA Level 1 Agent - RAG, FileProc, Video Analysis ")) + "\n")
  demo.launch(server_name="0.0.0.0", server_port=7860, debug=False, share=False)
15
  from concurrent.futures import ThreadPoolExecutor, as_completed
16
  from concurrent.futures import TimeoutError as FuturesTimeoutError
17
  from collections import defaultdict
18
+ import tempfile # Added for robust temporary directory management
19
 
20
  try:
21
  import google.generativeai as genai
22
+ from google.generativeai.types import GenerationConfig, HarmCategory, HarmBlockThreshold, FinishReason, HarmProbability
23
  except ImportError:
24
  genai = None
25
  GenerationConfig = None
26
+ HarmCategory = None # Added for safety settings/finish reason details
27
+ HarmBlockThreshold = None # Added for safety settings
28
+ FinishReason = None # Added for checking candidate finish reason
29
+ HarmProbability = None # Added for checking safety ratings probability
30
  print("WARNING: google-generativeai library not found. Install with: pip install google-generativeai")
31
 
32
  try:
 
70
  print("WARNING: librosa library not found. Audio processing may be impaired. Install with: pip install librosa")
71
 
72
  try:
73
+ import openpyxl
74
  except ImportError:
75
  openpyxl = None
76
  print("WARNING: openpyxl library not found. .xlsx file processing might fail. Install with: pip install openpyxl")
 
108
  TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
109
 
110
  AGENT_DEFAULT_TIMEOUT = 15
111
+ MAX_CONTEXT_LENGTH_LLM = 30000
112
 
113
+ MAX_FILE_SIZE = 5 * 1024 * 1024
114
  CSV_SAMPLE_ROWS = 10
115
+ MAX_FILE_CONTEXT_LENGTH = 10000
116
 
117
  # Global instances for video analysis pipelines
118
  video_object_detector_pipeline: Optional[Any] = None
119
  video_vqa_pipeline: Optional[Any] = None # Changed from species_classifier to VQA
120
 
121
+ VIDEO_ANALYSIS_DEVICE: int = -1
122
  VIDEO_ANALYSIS_OBJECT_MODEL = "facebook/detr-resnet-50"
123
  VIDEO_ANALYSIS_VQA_MODEL = "Salesforce/blip-vqa-capfilt-large" # Using a VQA model
124
 
125
+ VIDEO_MAX_FRAMES_TO_PROCESS = 120
126
  VIDEO_CONFIDENCE_THRESHOLD_BIRD = 0.6
127
  VIDEO_VQA_MIN_ANSWER_LENGTH = 3 # Minimum length for a VQA answer to be considered a species
128
+ VIDEO_VQA_CONFIDENCE_THRESHOLD = 0.3
129
 
130
  asr_pipeline_instance: Optional[Any] = None
131
+ ASR_MODEL_NAME = "openai/whisper-tiny"
132
+ ASR_PROCESSING_TIMEOUT_SECONDS = 1024
133
 
134
 
135
  DEFAULT_RAG_CONFIG = {
 
141
  'tavily_api_key': TAVILY_API_KEY,
142
  'default_max_results': 3, 'retry_attempts': 2, 'retry_delay': 2,
143
  'google_timeout': 8, 'tavily_depth': "basic",
144
+ 'max_query_length_tavily': 380
145
  },
146
  'processing': {
147
  'trusted_sources': {'wikipedia.org': 0.8, 'reuters.com': 0.75, 'apnews.com': 0.75},
 
170
  # Simplified device selection, consistent with FileProcessor's ASR
171
  device_id = 0 if torch.cuda.is_available() else -1
172
  if VIDEO_ANALYSIS_DEVICE == -1 : VIDEO_ANALYSIS_DEVICE = device_id # Set global if not user-overridden
173
+
174
  target_device = VIDEO_ANALYSIS_DEVICE if VIDEO_ANALYSIS_DEVICE != -1 else device_id
175
 
176
  video_object_detector_pipeline = hf_transformers_pipeline(
 
190
  try:
191
  device_id = 0 if torch.cuda.is_available() else -1
192
  if VIDEO_ANALYSIS_DEVICE == -1: VIDEO_ANALYSIS_DEVICE = device_id
193
+
194
  target_device = VIDEO_ANALYSIS_DEVICE if VIDEO_ANALYSIS_DEVICE != -1 else device_id
195
 
196
  video_vqa_pipeline = hf_transformers_pipeline(
 
210
  global asr_pipeline_instance
211
  if asr_pipeline_instance is None and hf_transformers_pipeline and torch:
212
        try:
+           device = -1  # -1 pins the Hugging Face ASR pipeline to CPU
            asr_pipeline_instance = hf_transformers_pipeline(
                "automatic-speech-recognition",
                model=ASR_MODEL_NAME,

            f"Columns: {', '.join(df.columns)}\nFirst {min(CSV_SAMPLE_ROWS, len(df))} sample rows:\n{df.head(CSV_SAMPLE_ROWS).to_markdown(index=False)}"
        )
        return FileProcessor._truncate_text(summary, filename, "CSV")
+       except Exception as e:
            if "tabulate" in str(e).lower() and df is not None:
                gaia_logger.error(f"CSV to_markdown error for '{filename}' (missing tabulate): {e}", exc_info=False)
                try:

                        break
                    except UnicodeDecodeError: continue
            if text is None: text = content.decode('utf-8', errors='ignore')
+
            summary = f"Text Document: '{filename}':\n{text}"
            return FileProcessor._truncate_text(summary, filename, "Text")
        except Exception as e:

        gaia_logger.info(f"Processing Excel file: {filename}")
        if not openpyxl: return f"Error: Excel processing skipped for '{filename}', openpyxl library not available."
        xls = None
+       df_list_for_fallback = []
        try:
            xls = pd.ExcelFile(io.BytesIO(content), engine='openpyxl')
            summary_parts = [f"Excel Document Summary: '{filename}'"]
            for sheet_name in xls.sheet_names:
                df = xls.parse(sheet_name)
+               df_list_for_fallback.append((sheet_name, df))
                sheet_summary = (
                    f"\n---\nSheet: '{sheet_name}' ({len(df)} rows, {len(df.columns)} columns):\n"
                    f"Columns: {', '.join(df.columns)}\nFirst {min(CSV_SAMPLE_ROWS, len(df))} sample rows:\n{df.head(CSV_SAMPLE_ROWS).to_markdown(index=False)}"

                    break
            full_summary = "".join(summary_parts)
            return FileProcessor._truncate_text(full_summary, filename, "Excel")
+       except Exception as e:
            if "tabulate" in str(e).lower():
                gaia_logger.error(f"Excel to_markdown error for '{filename}' (missing tabulate): {e}", exc_info=False)
                try:
                    summary_parts_fallback = [f"Excel Document Summary: '{filename}'"]
+                   if not df_list_for_fallback and xls:
                        for sheet_name in xls.sheet_names:
                            df_list_for_fallback.append((sheet_name, xls.parse(sheet_name)))
+                   elif not xls and not df_list_for_fallback:  # Ensure df_list_for_fallback is populated if xls parsing failed early
                        temp_xls = pd.ExcelFile(io.BytesIO(content), engine='openpyxl')
                        for sheet_name in temp_xls.sheet_names:
                            df_list_for_fallback.append((sheet_name, temp_xls.parse(sheet_name)))

+
                    for sheet_name_fb, df_fb in df_list_for_fallback:
                        sheet_summary_fallback = (
                            f"\n---\nSheet: '{sheet_name_fb}' ({len(df_fb)} rows, {len(df_fb.columns)} columns):\n"

            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text_content += page_text + "\n"
+               if len(text_content) > MAX_FILE_CONTEXT_LENGTH * 1.2:  # Check slightly over to allow truncation logic to handle it
                    break
        if not text_content:
            return f"PDF Document: '{filename}'. No text could be extracted or PDF is empty."

    @staticmethod
    def _perform_asr_transcription(asr_pipeline_ref, audio_data_np, filename_for_log):
        gaia_logger.info(f"ASR: Starting transcription for {filename_for_log} in thread.")
+
        return asr_pipeline_ref(audio_data_np, chunk_length_s=30, return_timestamps=False, generate_kwargs={"language": "en"})
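
    # A minimal sketch (illustrative, with hypothetical variable names) of how a
    # helper like this can be bounded by ASR_PROCESSING_TIMEOUT_SECONDS using the
    # ThreadPoolExecutor / FuturesTimeoutError imports at the top of the file:
    #
    #     with ThreadPoolExecutor(max_workers=1) as executor:
    #         future = executor.submit(FileProcessor._perform_asr_transcription,
    #                                  asr_pipeline_instance, y, filename)
    #         try:
    #             transcription = future.result(timeout=ASR_PROCESSING_TIMEOUT_SECONDS)
    #         except FuturesTimeoutError:
    #             ...  # surfaced to the caller as a timeout error string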

            return f"Error: Audio processing skipped for '{filename}', ASR pipeline not available."
        if not librosa:
            return f"Error: Audio processing skipped for '{filename}', librosa library not available."
+
        try:
            with io.BytesIO(content) as audio_buffer:
                y, sr = librosa.load(audio_buffer, sr=16000, mono=True)
+
            duration_seconds = len(y) / sr
            gaia_logger.info(f"Audio file: {filename}, Duration: {duration_seconds:.2f} seconds. Timeout set to: {ASR_PROCESSING_TIMEOUT_SECONDS}s")
            start_time = time.time()

        except FuturesTimeoutError:
            gaia_logger.warning(f"ASR transcription for '{filename}' timed out after {ASR_PROCESSING_TIMEOUT_SECONDS} seconds.")
            return f"Error: Audio transcription for '{filename}' timed out after {ASR_PROCESSING_TIMEOUT_SECONDS}s."
+       except Exception as e_thread:
            gaia_logger.error(f"ASR transcription thread for '{filename}' failed: {e_thread}", exc_info=True)
            if "3000 mel input features" in str(e_thread) or "return_timestamps" in str(e_thread):
                return f"Error processing Audio file '{filename}': Transcription failed due to long-form audio issue (mel features/timestamps). Original error: {str(e_thread)}"

        if not transcribed_text.strip():
            return f"Audio Document: '{filename}'. Transcription result was empty or ASR failed."
+
        summary = f"Audio Document (Transcription): '{filename}':\n{transcribed_text}"
        return FileProcessor._truncate_text(summary, filename, "Audio Transcription")

        except Exception:
            return f"File with Unknown Content Type: '{filename}'. Content is likely binary and cannot be displayed as text."

+class CacheManager:
    def __init__(self, ttl: int = 300, max_size: int = 100, name: str = "Cache"):
        self.ttl = ttl; self.max_size = max_size
        self._cache: Dict[Any, Any] = {}; self._timestamps: Dict[Any, float] = {}

            try:
                self._access_order.remove(key); self._access_order.append(key)
                return copy.deepcopy(self._cache[key])
+           except (ValueError, TypeError) as e:  # pragma: no cover
                self.delete(key); return None
+       elif key in self._cache:  # Expired
            self.delete(key)
        return None
    def set(self, key: Any, value: Any):
+       if key in self._cache: self.delete(key)  # Remove to update timestamp and order
        while len(self._cache) >= self.max_size and self._access_order:
            old_key = self._access_order.pop(0)
+           if old_key in self._cache:  # Should always be true
                del self._cache[old_key]; del self._timestamps[old_key]
        try: self._cache[key] = copy.deepcopy(value)
+       except TypeError: self._cache[key] = value  # For non-deep-copyable items
        self._timestamps[key] = time.time(); self._access_order.append(key)
    def delete(self, key: Any):
        if key in self._cache:
            try:
                del self._cache[key]; del self._timestamps[key]
                if key in self._access_order: self._access_order.remove(key)
+           except (ValueError, KeyError): pass  # pragma: no cover
    def clear(self): self._cache.clear(); self._timestamps.clear(); self._access_order.clear(); gaia_logger.info(f"[{self.name}] Cleared.")
    def __len__(self): return len(self._cache)
    def __contains__(self, key): return key in self._cache and (time.time() - self._timestamps.get(key, 0) < self.ttl)
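
# Usage sketch (illustrative, hypothetical values) for the CacheManager above:
# entries expire after `ttl` seconds and the least recently used key is evicted
# once `max_size` is reached; get/set deep-copy values to isolate callers.
#
#     demo = CacheManager(ttl=60, max_size=2, name="DemoCache")
#     demo.set("a", [1]); demo.set("b", [2])
#     demo.get("a")        # refreshes "a" in the LRU order
#     demo.set("c", [3])   # evicts "b", now the least recently used
#     assert "b" not in demo and demo.get("a") == [1]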

+class SearchProvider(ABC):
    def __init__(self, config_dict: Dict):
        self.provider_config = config_dict.get('search', {})
        self._enabled = False

        return self._perform_search(query, max_results)
    def available(self) -> bool: return self._enabled

+class GoogleProvider(SearchProvider):
    @property
    def provider_name(self) -> str: return "Google"
    def __init__(self, config_dict: Dict):

        except requests.exceptions.RequestException as e: gaia_logger.warning(f"[{self.provider_name}] RequestEx: '{query[:70]}': {e}"); return None
        except Exception as e: gaia_logger.error(f"[{self.provider_name}] Error: '{query[:70]}': {e}", exc_info=True); return None

+class TavilyProvider(SearchProvider):
    @property
    def provider_name(self) -> str: return "Tavily"
    def __init__(self, config_dict: Dict):

            return [{'href': h.get('url'), 'title': h.get('title',''), 'body': h.get('content','')} for h in hits]
        except Exception as e: gaia_logger.warning(f"[{self.provider_name}] Search fail: '{query[:70]}': {e}"); return None

+class DuckDuckGoProvider(SearchProvider):
    @property
    def provider_name(self) -> str: return "DuckDuckGo"
    def __init__(self, config_dict: Dict):

            return [{'href': r.get('href'), 'title': r.get('title',''), 'body': r.get('body','')} for r in hits]
        except Exception as e: gaia_logger.warning(f"[{self.provider_name}] Search fail: '{query[:70]}': {e}"); return None

+class CompositeSearchClient:
    def __init__(self, config_dict: Dict):
+       self.config = config_dict
        self._search_config = config_dict.get('search', {})
        self.providers = self._init_providers(config_dict)
        self.cache = CacheManager(

                    results = prov.search(q, actual_r)
                    if results is not None: self.cache.set(cache_key, results); return results
                    if attempt < self._retry_att: time.sleep(self._retry_del)
+               except Exception as e:  # pragma: no cover
                    if attempt < self._retry_att: time.sleep(self._retry_del)
        self.cache.set(cache_key, [])
        return []

+class GaiaQueryBuilder:
    def __init__(self, base_query: str, config_dict: Dict):
        self.base_query = base_query.strip()
        self.config = config_dict
    def get_queries(self) -> Dict[str, List[Tuple[str, str]]]:
        return {'primary': [(self.base_query, 'GENERAL')]} if self.base_query else {'primary': []}

+class ResultProcessor:
    def __init__(self, config_dict: Dict):
        self.proc_config = config_dict.get('processing', {})
        self.trusted_sources = self.proc_config.get('trusted_sources', {})
        self.seen_urls: Set[str] = set()
+       self.date_pattern = DEFAULT_RAG_CONFIG['processing'].get('date_pattern', r'\b\d{4}\b')
    def process_batch(self, results: List[Dict], query_tag: str, initial_cat: str = 'GENERAL') -> List[Dict]:
        processed: List[Dict] = []
        if not results: return processed

        result['temporal_relevance'] = temporal_r
        result['combined_score'] = (source_q * 0.6 + temporal_r * 0.4)
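
        # Worked example of the weighting above: a wikipedia.org result (source
        # quality 0.8) with neutral temporal relevance 0.5 scores
        # 0.8 * 0.6 + 0.5 * 0.4 = 0.68.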

+class ContentEnricher:
    def __init__(self, config_dict: Dict):
        self.enrich_config = config_dict.get('enrichment', {})
        self._enabled = self.enrich_config.get('enabled', False) and bool(BeautifulSoup)

        except Exception as e: result['enrichment_failed'] = type(e).__name__
        return result

+class GeneralRAGPipeline:
    def __init__(self, config_dict: Optional[Dict] = None):
        self.config = config_dict if config_dict is not None else DEFAULT_RAG_CONFIG
        self.search_client = CompositeSearchClient(self.config)

        max_r_pq = cfg_search.get('default_max_results', 3)
        cache_key = (q, max_r_pq, total_lim, enrich_en, enrich_cnt)
        if not force_refresh and (cached := self.pipeline_cache.get(cache_key)) is not None: return cached
+       if force_refresh: self.search_client.cache.clear()  # clears the underlying search client cache
+       if self.enricher and force_refresh: self.enricher.cache.clear()  # clear the enricher cache on force_refresh
        all_res, res_proc = [], ResultProcessor(self.config)
        staged_qs = GaiaQueryBuilder(q, self.config).get_queries()
        for stage, qs_in_stage in staged_qs.items():
            for query_s, cat in qs_in_stage:
+               if len(all_res) >= total_lim * 2: break  # fetch more initially to allow for better selection
                s_res = self.search_client.search(query_s, max_results=max_r_pq, force_refresh=force_refresh)
                all_res.extend(res_proc.process_batch(s_res or [], query_s, initial_cat=cat))
        all_res.sort(key=lambda x: x.get('combined_score', 0), reverse=True)

        self.api_url = api_url
        self.llm_model: Optional[Any] = None
        self.rag_pipeline = GeneralRAGPipeline(DEFAULT_RAG_CONFIG)
+
        if genai and GOOGLE_GEMINI_API_KEY:
            try:
                genai.configure(api_key=GOOGLE_GEMINI_API_KEY)
                model_name = 'gemini-2.5-flash-preview-05-20'
                self.llm_model = genai.GenerativeModel(model_name)
                gaia_logger.info(f"Gemini LLM ('{model_name}') initialized.")
            except Exception as e:
                gaia_logger.error(f"Error initializing Gemini LLM: {e}", exc_info=True)
+               # Attempt fallback if the specific model fails (e.g. not available in region, or a name typo)
+               try:
+                   gaia_logger.info("Attempting fallback to 'gemini-1.0-pro' for LLM.")
+                   self.llm_model = genai.GenerativeModel('gemini-1.0-pro')  # a common, generally available model
+                   gaia_logger.info("Gemini LLM ('gemini-1.0-pro') initialized as fallback.")
+               except Exception as e_fallback:
+                   gaia_logger.error(f"Fallback LLM initialization also failed: {e_fallback}", exc_info=True)
        else:
            gaia_logger.warning("Gemini LLM dependencies or API key missing.")
+
        if not self.llm_model:
            gaia_logger.warning("LLM (Gemini) unavailable. Limited capabilities.")
+
        _get_video_object_detector()
+       _get_video_vqa_pipeline()

        gaia_logger.info(f"GaiaLevel1Agent (RAG, FileProcessor, VideoAnalysis) initialized. API: {self.api_url}")

    @lru_cache(maxsize=32)
    def _fetch_and_process_file_content(self, task_id: str) -> Optional[str]:
        file_url = f"{self.api_url}/files/{task_id}"
+       for attempt in range(2):  # retry once
            try:
                response = requests.get(file_url, timeout=AGENT_DEFAULT_TIMEOUT)
                response.raise_for_status()
+               filename = FileProcessor._get_filename_from_url(response.url)  # fallback from URL
                content_disposition = response.headers.get('Content-Disposition')
                if content_disposition:
                    header_filename = FileProcessor._get_filename_from_url(content_disposition)

            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 404:
                    gaia_logger.warning(f"File not found for task {task_id}: {file_url}")
+                   return None  # no point retrying a 404
                gaia_logger.warning(f"HTTP error fetching file {task_id}: {e}")
            except requests.exceptions.Timeout:
                gaia_logger.warning(f"Timeout fetching file {task_id}")

    def _clean_vqa_species_answer(self, answer_text: str) -> str:
        """Cleans and normalizes a VQA answer to extract a potential species name."""
        if not answer_text: return ""
        cleaned = answer_text.lower().strip()
        # Remove common prefixes
        prefixes_to_remove = [
+           "a type of ", "a variety of ", "it's a ", "it is a ", "an ", "a ", "the ",
            "this is a ", "this bird is a ", "it appears to be a ", "looks like a ",
            "it's an ", "it is an ", "this is an ", "this bird is an ", "it appears to be an ", "looks like an "
        ]
        for prefix in prefixes_to_remove:
            if cleaned.startswith(prefix):
                cleaned = cleaned[len(prefix):]
        # Remove common suffixes
        suffixes_to_remove = [" bird", " species"]
        for suffix in suffixes_to_remove:
            if cleaned.endswith(suffix):
                cleaned = cleaned[:-len(suffix)]
        # Remove parenthetical content or simple descriptive clauses
        cleaned = re.sub(r"\s*\(.*\)\s*$", "", cleaned).strip()  # e.g. "robin (american)" -> "robin"
        cleaned = re.sub(r",\s*which is.*$", "", cleaned).strip()  # e.g. "sparrow, which is small" -> "sparrow"

        # Basic character filtering (allow letters, digits for things like "Type 2", spaces, hyphens)
        cleaned = re.sub(r"[^a-z0-9\s\-]", "", cleaned).strip()
        # Normalize whitespace
        cleaned = " ".join(cleaned.split())
        # Filter out very generic or uncertain answers post-cleaning
+       uncertain_terms = ["unknown", "not sure", "unclear", "difficult to say", "generic", "common bird", "no bird", "not a bird"]
        if any(term in cleaned for term in uncertain_terms) or len(cleaned) < VIDEO_VQA_MIN_ANSWER_LENGTH:
            return ""  # return empty if too generic or short

  return "Video analysis skipped: Pillow library not available."
875
 
876
  detector = _get_video_object_detector()
877
+ vqa_model = _get_video_vqa_pipeline()
878
 
879
  if not detector or not vqa_model:
880
  return "Video analysis skipped: ML pipelines (detector or VQA) not available."
881
 
882
+ video_file_path: Optional[str] = None
883
+ temp_dir_obj: Optional[tempfile.TemporaryDirectory] = None
884
+ cap: Optional[cv2.VideoCapture] = None
885
+
886
 
887
  try:
888
+ temp_dir_obj = tempfile.TemporaryDirectory(prefix="gaia_video_")
889
+ temp_dir = temp_dir_obj.name
890
+ gaia_logger.info(f"Created temporary directory for video: {temp_dir}")
891
+
892
  ydl_opts = {
893
+ 'format': 'bestvideo[height<=480][ext=mp4]+bestaudio[ext=m4a]/bestvideo[height<=480][ext=webm]+bestaudio[ext=webm]/best[height<=480][ext=mp4]/best[height<=480][ext=webm]/best[height<=480]',
894
  'outtmpl': os.path.join(temp_dir, '%(id)s.%(ext)s'),
895
+ 'quiet': True,
896
+ 'max_filesize': 75 * 1024 * 1024,
897
  'overwrites': True, 'noprogress': True, 'noplaylist': True, 'socket_timeout': 20,
898
+ 'merge_output_format': 'mp4', # Encourage mp4 output if merging
899
+ # Removed 'postprocessors': [{'key': 'FFmpegExtractAudio', ...}]
900
  }
901
  gaia_logger.info(f"Attempting to download video: {video_url}")
902
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
903
+ info_dict = ydl.extract_info(video_url, download=True)
904
+ video_file_path = ydl.prepare_filename(info_dict) # Get the final path
905
+
906
+ # Check if downloaded file is indeed a video format recognised by OpenCV
907
+ # Common video extensions that OpenCV usually handles well.
908
+ # This check is made more robust by also trying to open it.
909
+ if not video_file_path or not any(video_file_path.lower().endswith(ext) for ext in ['.mp4', '.webm', '.avi', '.mkv', '.mov', '.flv']):
910
+ gaia_logger.warning(f"Downloaded file '{video_file_path}' might not be a standard video format or download failed to produce one. Will attempt to open.")
911
+ # Try to find a plausible video file if the main one looks suspicious
912
+ possible_video_files = [f for f in os.listdir(temp_dir) if f.startswith(info_dict.get('id','')) and any(f.lower().endswith(ext) for ext in ['.mp4', '.webm'])]
913
+ if possible_video_files:
914
+ video_file_path = os.path.join(temp_dir, possible_video_files[0])
915
+ gaia_logger.info(f"Using alternative video file from temp_dir: {video_file_path}")
916
+ # else: # The cap.isOpened() check below will handle if it's truly unusable
917
+ # gaia_logger.error(f"No suitable video file found in temp_dir for {info_dict.get('id','')}")
918
+ # return "Video download resulted in a non-video or unusable file."
 
 
 
919
 
920
 
921
  if not video_file_path or not os.path.exists(video_file_path):
 
927
  cap = cv2.VideoCapture(video_file_path)
928
  if not cap.isOpened():
929
  gaia_logger.error(f"Cannot open video file: {video_file_path}")
930
+ return f"Cannot open video file: {os.path.basename(video_file_path if video_file_path else 'N/A')}"
931
+
932
 
933
  max_simultaneous_species = 0
934
  species_details_for_max_frame = ""
935
+
936
  total_frames_video = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
937
  fps = cap.get(cv2.CAP_PROP_FPS)
938
+ if not fps or fps <= 0: fps = 25 # Default fps if detection fails or is zero
939
+
940
  frame_interval = max(1, int(fps)) # Process ~1 frame per second
941
+
942
  frames_analyzed_count = 0
943
  current_frame_num = 0
944
+
945
+ gaia_logger.info(f"Video Info: ~{total_frames_video // fps if fps > 0 else total_frames_video:.0f}s, {fps:.2f} FPS. Analyzing ~1 frame/sec up to {VIDEO_MAX_FRAMES_TO_PROCESS} frames.")
946
 
947
  while cap.isOpened() and frames_analyzed_count < VIDEO_MAX_FRAMES_TO_PROCESS:
948
  cap.set(cv2.CAP_PROP_POS_FRAMES, current_frame_num) # Jump to frame
949
  ret, frame_data = cap.read()
950
  if not ret: break
951
 
952
+ timestamp_sec = current_frame_num / fps if fps > 0 else frames_analyzed_count # Fallback timestamp if fps is bad
953
  gaia_logger.info(f"Processing frame {current_frame_num} (analyzed {frames_analyzed_count+1}/{VIDEO_MAX_FRAMES_TO_PROCESS}) at ~{timestamp_sec:.1f}s")
954
+
955
  try:
956
  pil_image = Image.fromarray(cv2.cvtColor(frame_data, cv2.COLOR_BGR2RGB))
957
  except Exception as e_conv:
958
  gaia_logger.warning(f"Frame {current_frame_num} conversion to PIL failed: {e_conv}")
959
  current_frame_num += frame_interval
960
  continue
961
+
962
  detected_objects = detector(pil_image)
963
  bird_crops_this_frame = []
964
  for obj in detected_objects:
965
+ # Check label case-insensitively
966
+ if obj['label'].lower() == 'bird' and obj['score'] > VIDEO_CONFIDENCE_THRESHOLD_BIRD:
967
  box = obj['box']
968
  xmin, ymin, xmax, ymax = box['xmin'], box['ymin'], box['xmax'], box['ymax']
969
+ # Ensure box coordinates are valid
970
  if not (0 <= xmin < xmax <= pil_image.width and 0 <= ymin < ymax <= pil_image.height):
971
  gaia_logger.debug(f"Invalid box for bird: {box}, img size: {pil_image.size}")
972
  continue
 
975
  except Exception as e_crop:
976
  gaia_logger.warning(f"Cropping bird failed for box {box}: {e_crop}")
977
 
978
+
979
  if not bird_crops_this_frame:
980
  current_frame_num += frame_interval
981
  frames_analyzed_count += 1
 
986
  vqa_question = "What is the specific species of this bird?"
987
 
988
  for idx, bird_crop_img in enumerate(bird_crops_this_frame):
989
+ if bird_crop_img.width < 20 or bird_crop_img.height < 20: continue
990
  try:
991
+ vqa_answer_list = vqa_model(bird_crop_img, question=vqa_question, top_k=1)
992
+
993
  raw_vqa_answer_text = ""
994
+ vqa_confidence = VIDEO_VQA_CONFIDENCE_THRESHOLD # Default
995
 
996
  if isinstance(vqa_answer_list, list) and vqa_answer_list:
997
  raw_vqa_answer_text = vqa_answer_list[0].get('answer', "")
998
  vqa_confidence = vqa_answer_list[0].get('score', vqa_confidence)
999
+ elif isinstance(vqa_answer_list, dict):
1000
  raw_vqa_answer_text = vqa_answer_list.get('answer', "")
1001
  vqa_confidence = vqa_answer_list.get('score', vqa_confidence)
1002
 
1003
  cleaned_species_name = self._clean_vqa_species_answer(raw_vqa_answer_text)
1004
+
1005
  if cleaned_species_name and vqa_confidence >= VIDEO_VQA_CONFIDENCE_THRESHOLD :
1006
  frame_species_identified.add(cleaned_species_name)
1007
  current_frame_species_details.append(f"{cleaned_species_name} (VQA conf: {vqa_confidence:.2f})")
1008
+ elif cleaned_species_name:
1009
  gaia_logger.debug(f"VQA species '{cleaned_species_name}' (raw: '{raw_vqa_answer_text}') for bird {idx} below confidence {VIDEO_VQA_CONFIDENCE_THRESHOLD} (score: {vqa_confidence:.2f})")
1010
  else:
1011
  gaia_logger.debug(f"VQA for bird {idx} resulted in unusable/generic species: '{raw_vqa_answer_text}'")
1012
 
1013
  except Exception as e_vqa:
1014
  gaia_logger.warning(f"VQA inference error for bird crop {idx} (frame {current_frame_num}): {e_vqa}")
1015
+
1016
  if len(frame_species_identified) > max_simultaneous_species:
1017
  max_simultaneous_species = len(frame_species_identified)
1018
  species_details_for_max_frame = f"At ~{timestamp_sec:.1f}s, inferred species: {', '.join(current_frame_species_details) if current_frame_species_details else 'None specific'}"
1019
+
1020
  if frame_species_identified:
1021
  gaia_logger.info(f"Frame {current_frame_num} (~{timestamp_sec:.1f}s): Found {len(frame_species_identified)} distinct species types: {', '.join(list(frame_species_identified))}")
1022
 
1023
  current_frame_num += frame_interval
1024
  frames_analyzed_count += 1
1025
+
1026
+ # cap.release() should be in finally
1027
+
1028
  context_str = (f"Video analysis result: The highest number of distinct bird species types inferred simultaneously "
1029
  f"in the analyzed portion of the video (up to {VIDEO_MAX_FRAMES_TO_PROCESS} frames) was {max_simultaneous_species}. "
1030
  f"{('Details from a frame with this count: ' + species_details_for_max_frame) if species_details_for_max_frame else 'No specific species details captured for the max count frame or no birds found.'}")
 
1033
 
1034
  except yt_dlp.utils.DownloadError as e:
1035
  gaia_logger.error(f"yt-dlp download error for {video_url}: {str(e)}")
1036
+ msg_str = str(e)
1037
+ clean_msg = msg_str # Default to full message
1038
+ if "Unsupported URL" in msg_str: clean_msg = "Unsupported video URL."
1039
+ elif "video unavailable" in msg_str.lower(): clean_msg = "Video is unavailable."
1040
+ elif "private video" in msg_str.lower(): clean_msg = "Video is private."
1041
+ elif "age restricted" in msg_str.lower(): clean_msg = "Video is age-restricted and requires login."
1042
+ elif "Sign in to confirm" in msg_str or "cookies" in msg_str.lower() or "authentication" in msg_str.lower():
1043
+ clean_msg = "Video download failed due to YouTube restrictions (e.g., sign-in, cookies, or authentication required)."
1044
+ elif "HTTP Error 403" in msg_str or "Forbidden" in msg_str : clean_msg = "Access to video denied (Forbidden/403)."
1045
+ elif "HTTP Error 404" in msg_str or "Not Found" in msg_str : clean_msg = "Video not found (404)."
1046
+ # Keep the message relatively concise for the LLM
1047
+ return f"Video download failed: {clean_msg[:250] + '...' if len(clean_msg) > 250 else clean_msg}" # Limit length of detailed message
1048
+
1049
  except Exception as e:
1050
  gaia_logger.error(f"Error during video analysis for {video_url}: {e}", exc_info=True)
1051
+ return f"An unexpected error occurred during video analysis: {type(e).__name__} - {str(e)[:100]}"
1052
  finally:
1053
+ if cap and cap.isOpened():
1054
+ cap.release()
1055
+ gaia_logger.info("Video capture released.")
1056
+ if temp_dir_obj:
1057
+ temp_dir_path_for_log = temp_dir_obj.name # Store before cleanup for logging
1058
+ try:
1059
+ temp_dir_obj.cleanup()
1060
+ gaia_logger.info(f"Successfully cleaned up temp video directory: {temp_dir_path_for_log}")
1061
+ except Exception as e_cleanup:
1062
+ gaia_logger.error(f"Error cleaning up temp video directory {temp_dir_path_for_log}: {e_cleanup}", exc_info=True)

    def _parse_llm_output(self, llm_text: str) -> Dict[str, str]:
        reasoning_trace = ""
        model_answer = ""
        final_answer_sentinel = "FINAL ANSWER:"
+
        parts = llm_text.split(final_answer_sentinel, 1)
+
        if len(parts) == 2:
            reasoning_trace = parts[0].strip()
            model_answer = parts[1].strip()
        else:
+           reasoning_trace = llm_text  # If sentinel not found, assume whole output is reasoning
            lines = llm_text.strip().split('\n')
+           # Try to take the last non-empty line as the answer, or a default if all reasoning
+           model_answer = "Could not parse answer"  # default if no clear answer found
+           for line in reversed(lines):
+               if line.strip():
+                   model_answer = line.strip()
+                   break
            gaia_logger.warning(f"LLM output did not contain '{final_answer_sentinel}'. Using fallback parsing. Full LLM text: '{llm_text[:200]}...'")

        return {"model_answer": model_answer, "reasoning_trace": reasoning_trace}

    def _formulate_answer_with_llm(self, question: str, file_context: Optional[str], web_context: Optional[str]) -> Dict[str, str]:
        default_model_answer = "Information not available in provided context"
        default_reasoning = "LLM processing failed or context insufficient."

+       if not self.llm_model or not genai or not GenerationConfig or not FinishReason or not HarmCategory or not HarmBlockThreshold:
+           gaia_logger.warning("LLM model (Gemini) or necessary enums/configs not available for answer formulation.")
+           reasoning = "LLM model (Gemini) or its configuration components not available for answer formulation."
            answer_val = default_model_answer
+           # Provide some context indication even if the LLM is down
            if web_context and file_context:
                reasoning += " Context from file and web was found but not processed by LLM."
+           elif web_context:
                reasoning += f" External context found: {web_context.splitlines()[0] if web_context.splitlines() else 'No specific snippet found.'}"
            elif file_context:
                reasoning += f" File context found: {file_context[:100]}..."

                reasoning += " No context found."
            return {"model_answer": answer_val, "reasoning_trace": reasoning}

+
        prompt_parts = [
+           "You are a general AI assistant. Your primary goal is to answer the user's question accurately and concisely based *only* on the provided context (from a document, web search results, or video analysis).",
            "If the context comes from 'Video analysis result', understand that 'species types inferred' means the video was analyzed by an AI to identify birds and infer their species using visual question answering. The count refers to the maximum number of *distinct types* of birds identified in this way in any single analyzed video frame.",
            "First, think step-by-step and briefly explain your reasoning based on the context. This part is for clarity and should come before your final answer.",
            "After your reasoning, you MUST conclude your response with the exact phrase 'FINAL ANSWER:', followed by your answer on the same line or the next.",

            " - If the answer is a string: use as few words as possible. Do not use articles (a, an, the) unless grammatically essential. Do not use abbreviations (e.g., write 'United States' not 'USA', 'Los Angeles' not 'LA') unless the question implies an abbreviation or it's a very common, universally understood one relevant to the context. Write digits in plain text (e.g., 'two' not '2') if they are part of a descriptive phrase, but use numerical digits if the question implies a code, identifier, version number, or a direct numerical value is more natural (e.g., 'Windows 10', 'part number 5').",
            " - If the answer is a list of items: provide them as a comma-separated list (e.g., item1, item2, item3). Apply the number or string rules above to each element in the list.",
            " - If the context is insufficient to answer the question: your reasoning should clearly state this, and your FINAL ANSWER should be 'Information not available in provided context'. Do not invent answers.",
+           "Prioritize information from 'Enriched Content' from web search results if available and relevant over shorter 'Snippets'. Information from 'Video Analysis Context' is highly specific to video-related questions.",
            "\nUser Question: ", question
        ]

        if file_context:
            file_header = "\n\nContext from Provided Document:\n---"
            file_footer = "\n---"
            len_web_ctx = len(web_context) if web_context else 0
+           max_len_for_file = MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - len_web_ctx - len(file_header) - len(file_footer) - 500  # buffer
+
+           if max_len_for_file > 100:
                truncated_file_context = file_context[:max_len_for_file]
                if len(file_context) > len(truncated_file_context):
                    truncated_file_context += " ... (file context truncated)"
                prompt_parts.extend([file_header, truncated_file_context, file_footer])
                current_prompt_text_len += len(file_header) + len(truncated_file_context) + len(file_footer)
                context_added = True
+           else: gaia_logger.warning(f"Not enough space for file context in LLM prompt. Available after other parts: {MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - len_web_ctx - len(file_header) - len(file_footer)}")

+       if web_context:
            header_text = "\n\nContext from External Sources (Web/Video):\n---"
+           if "Video analysis result:" in web_context and "Source [" not in web_context:  # only video
                header_text = "\n\nContext from Video Analysis:\n---"
+           elif "Source [" in web_context and "Video analysis result:" not in web_context:  # only web
                header_text = "\n\nContext from Web Search Results:\n---"
+           # If both are present, the generic "External Sources" header is fine.
+
            web_footer = "\n---"
+           available_len_for_web = MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - len(header_text) - len(web_footer) - 300

+           if available_len_for_web > 100:
                truncated_web_context = web_context
                if len(web_context) > available_len_for_web:
                    truncated_web_context = web_context[:available_len_for_web] + "\n... (external context truncated)"
                    gaia_logger.info(f"Truncated external (web/video) context from {len(web_context)} to {len(truncated_web_context)} chars for LLM.")
                prompt_parts.extend([header_text, truncated_web_context, web_footer])
+               context_added = True
+           else: gaia_logger.warning(f"Not enough space for web/video context in LLM prompt. Available: {MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - len(header_text) - len(web_footer)}")

        if not context_added: prompt_parts.append("\n\nNo document, web, or video context could be provided due to length constraints or availability.")
+       prompt_parts.append("\n\nReasoning and Final Answer:")
        final_prompt = "\n".join(prompt_parts)
+
        gaia_logger.info(f"LLM Prompt (first 300 chars): {final_prompt[:300]}...")
        gaia_logger.info(f"LLM Total prompt length: {len(final_prompt)} chars.")

        try:
+           gen_config = GenerationConfig(temperature=0.1, top_p=0.95, max_output_tokens=1024)
+           safety_settings = [
+               {"category": HarmCategory.HARM_CATEGORY_HARASSMENT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
+               {"category": HarmCategory.HARM_CATEGORY_HATE_SPEECH, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
+               {"category": HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
+               {"category": HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
+           ]
+           response = self.llm_model.generate_content(final_prompt, generation_config=gen_config, safety_settings=safety_settings)
+
+           if hasattr(response, 'prompt_feedback') and response.prompt_feedback.block_reason:
+               reason_text = response.prompt_feedback.block_reason.name
+               block_details = "; ".join([f"{sr.category.name}: {sr.probability.name}" for sr in response.prompt_feedback.safety_ratings if hasattr(sr, 'blocked') and sr.blocked])
+               gaia_logger.warning(f"Gemini prompt blocked. Reason: {reason_text}. Details: {block_details}")
+               return {"model_answer": "LLM Error: Prompt blocked", "reasoning_trace": f"My input was blocked by the LLM provider (Reason: {reason_text}). Details: {block_details}"}
+
+           if not response.candidates:
+               gaia_logger.warning("Gemini response has no candidates.")
+               return {"model_answer": "LLM Error: No response", "reasoning_trace": "LLM did not provide any response candidates."}
+
+           candidate = response.candidates[0]
+           # Check the candidate's finish_reason
+           if candidate.finish_reason != FinishReason.STOP:
+               reason_name = candidate.finish_reason.name if hasattr(candidate.finish_reason, 'name') else str(candidate.finish_reason)
+               safety_ratings_str = ""
+               if candidate.safety_ratings:  # check that safety_ratings exist
+                   relevant_ratings = [
+                       f"{sr.category.name.split('_')[-1] if hasattr(sr.category, 'name') else 'CAT?'}: {(sr.probability.name if hasattr(sr.probability, 'name') else 'PROB?')}"
+                       for sr in candidate.safety_ratings if (hasattr(sr, 'blocked') and sr.blocked) or (hasattr(sr, 'probability') and HarmProbability and sr.probability.value >= HarmProbability.MEDIUM.value)
+                   ]
+                   if relevant_ratings: safety_ratings_str = "; ".join(relevant_ratings)
+
+               gaia_logger.warning(f"Gemini candidate did not finish successfully. Reason: {reason_name}. Safety Ratings: {safety_ratings_str if safety_ratings_str else 'N/A'}")
+
+               user_message = "LLM Error: Response incomplete"
+               if candidate.finish_reason == FinishReason.SAFETY: user_message = "LLM Error: Response blocked for safety"
+               elif candidate.finish_reason == FinishReason.MAX_TOKENS: user_message = "LLM Error: Response truncated (max tokens)"
+               elif candidate.finish_reason == FinishReason.RECITATION: user_message = "LLM Error: Response blocked (recitation)"
+
+               return {
+                   "model_answer": user_message,
+                   "reasoning_trace": f"LLM generation stopped. Reason: {reason_name}. " + (f"Details: {safety_ratings_str}" if safety_ratings_str else "")
+               }
+
+           llm_answer_text = response.text  # safe to access once finish_reason is STOP
            gaia_logger.info(f"LLM Raw Full Answer (first 200 chars): {llm_answer_text[:200]}...")
            return self._parse_llm_output(llm_answer_text)
+
+       except ValueError as ve:
+           if "finish_reason" in str(ve).lower() and ("part" in str(ve).lower() or "candidate" in str(ve).lower()):
+               gaia_logger.error(f"ValueError accessing Gemini response.text, likely due to a non-STOP finish_reason not caught explicitly: {ve}", exc_info=False)
+               fr_from_ex = "Unknown (from ValueError)"
+               match_fr = re.search(r"finish_reason.*?is\s*(\w+)", str(ve), re.IGNORECASE)  # try to extract the name or number
+               if match_fr: fr_from_ex = match_fr.group(1)
+               return {"model_answer": "LLM Error: Invalid response state",
+                       "reasoning_trace": f"Could not parse LLM response. Finish reason possibly {fr_from_ex}. Details: {str(ve)[:150]}"}
+           else:  # other ValueErrors
+               gaia_logger.error(f"ValueError during Gemini call or processing: {ve}", exc_info=True)
+               return {"model_answer": "LLM Error: Value error", "reasoning_trace": f"A value error occurred: {str(ve)}"}
        except Exception as e:
            gaia_logger.error(f"Error calling Gemini API: {e}", exc_info=True)
            error_type_name = type(e).__name__
            error_message = str(e)
            reasoning = f"Error calling Gemini API: {error_type_name} - {error_message}"
            answer_val = "LLM API error"

            if "API key" in error_message.lower() and ("invalid" in error_message.lower() or "not valid" in error_message.lower()):
                answer_val = "LLM Auth Error"
                reasoning = "LLM API key is invalid or not authorized."

            elif "InternalServerError" in error_type_name or "500" in error_message:
                answer_val = "LLM server error"
                reasoning = "Error: LLM experienced an internal server error."
+           elif "ServiceUnavailable" in error_type_name or "503" in error_message:
+               answer_val = "LLM service unavailable"
+               reasoning = "Error: LLM service is temporarily unavailable (503)."

            return {"model_answer": answer_val, "reasoning_trace": reasoning}

    def __call__(self, question: str, task_id: Optional[str] = None) -> Dict[str, str]:
        gaia_logger.info(f"Agent processing: '{question[:70]}...', TaskID: {task_id}")
        q_lower = question.lower().strip()
+
        video_context_str: Optional[str] = None
+       # Regex for YouTube URLs (watch, shorts, and youtu.be forms)
+       video_url_match = re.search(r"(https?://(?:www\.)?(?:youtube\.com/(?:watch\?v=|shorts/)|youtu\.be/)[\w\-=&%]+)", question)

+       video_keywords = ["video", "youtube.com", "youtu.be", "clip", "recording"]  # broad keywords
+       species_keywords = ["species", "bird", "birds", "type of bird", "kinds of bird", "different birds"]
+       action_keywords = ["count", "how many", "number of", "simultaneously", "at the same time", "on camera", "identify", "list"]

+       # Trigger video analysis only if a URL is found AND relevant keywords are present
        if video_url_match and \
+          any(vk in q_lower for vk in video_keywords) and \
           any(sk in q_lower for sk in species_keywords) and \
           any(ak in q_lower for ak in action_keywords):
            video_url = video_url_match.group(0)

            return {"model_answer": "general AI assistant", "reasoning_trace": "User asked for my identity."}

        file_ctx_str: Optional[str] = None
+       file_indicators = ["document", "file", "text provided", "attachment", "content of the file", "data in the file", "excel sheet", ".pdf", ".csv", ".txt", "audio file", "code snippet", "log file", "spreadsheet"]
+       if task_id and (any(fi in q_lower for fi in file_indicators) or "this task involves a file" in q_lower or "the provided" in q_lower or "attached" in q_lower):
            file_ctx_str = self._fetch_and_process_file_content(task_id)
            if file_ctx_str: gaia_logger.info(f"Processed file context ({len(file_ctx_str)} chars) for task {task_id}")
            else: gaia_logger.warning(f"No file content or failed to process for task {task_id}")
+
+       web_rag_ctx_str: Optional[str] = None
        needs_web_rag = True
+
+       # Decide whether a RAG web search is needed
+       if video_context_str:
+           # If video analysis seems to directly answer a counting/identification question about the video
+           if "Video analysis result:" in video_context_str and "download failed" not in video_context_str.lower() and "skipped" not in video_context_str.lower():
+               if (("count" in q_lower or "how many" in q_lower or "number of" in q_lower) and ("simultaneously" in q_lower or "at the same time" in q_lower or "distinct" in q_lower)) and any(sk_q in q_lower for sk_q in species_keywords):
+                   needs_web_rag = False  # video analysis is likely sufficient
+                   gaia_logger.info("Video context seems primary for a specific video counting question; web RAG may be skipped.")
+
+       if file_ctx_str and len(file_ctx_str) > 100 and not video_context_str:  # only consider the file if there is no video context
+           # Keywords suggesting the answer is likely within the document
+           doc_can_answer_kws = ["summarize", "according to the document", "in the provided text", "based on the file content", "from this file", "in this data"]
+           # Keywords suggesting external info is needed despite the file
+           web_still_needed_kws = ["what is the current", "latest news on", "public opinion of", "search for more about", "compare this to", "what happened after"]

            if any(kw in q_lower for kw in doc_can_answer_kws) and not any(kw in q_lower for kw in web_still_needed_kws):
                needs_web_rag = False
+               gaia_logger.info("File context seems primary; web RAG may be skipped.")
+           # Weaker heuristic: a statement or simple file query not asking for external comparison or updates
+           elif not any(kw in q_lower for kw in web_still_needed_kws) and not question.strip().endswith("?"):
+               if not any(qk in q_lower for qk in ["why is", "how does", "explain the impact of", "what if"]):  # questions that often need broader context
                    needs_web_rag = False
+                   gaia_logger.info("File context seems sufficient for a non-complex query; web RAG may be skipped.")

        if "don't search" in q_lower or "do not search" in q_lower or "without searching" in q_lower or "without using the internet" in q_lower:
            needs_web_rag = False
            gaia_logger.info("Web RAG explicitly disabled by user query.")
+
        if needs_web_rag:
+           search_q = question.replace("?", "").strip()
+           # If video analysis failed, the question may still be about the video's topic, so RAG remains useful.
+           # If file context is present but RAG is still needed, the LLM reconciles the two.
+           rag_res = self.rag_pipeline.analyze(query=search_q, force_refresh=False)
            if rag_res:
                snippets = []
                for i, res_item in enumerate(rag_res):
                    title = res_item.get('title', 'N/A')
                    body = res_item.get('body', '')
                    href = res_item.get('href', '#')
+                   provider_info = res_item.get('query_tag', 'WebSearch')  # could be refined if RAG exposes more source details
+                   source_type = "EnrichedContent" if res_item.get('enriched') else "Snippet"
+                   body_preview = (body[:1500] + "...") if len(body) > 1500 else body
+                   snippets.append(f"Source [{i+1} - {provider_info}]: {title}\nURL: {href}\n{source_type}: {body_preview}\n---")
                web_rag_ctx_str = "\n\n".join(snippets)
                if web_rag_ctx_str: gaia_logger.info(f"RAG pipeline yielded web results ({len(web_rag_ctx_str)} chars).")
                else: gaia_logger.warning("RAG pipeline yielded no web results for the query.")

        final_llm_external_context_parts = []
        if video_context_str:
+           final_llm_external_context_parts.append(f"{video_context_str}")  # header already included in video_context_str
        if web_rag_ctx_str:
+           # No separate header needed: video_context_str carries its own header
+           # and web_rag_ctx_str is structured with "Source [n]:" entries.
+           final_llm_external_context_parts.append(f"{web_rag_ctx_str}")
+
+       final_llm_external_context = "\n\n---\n\n".join(final_llm_external_context_parts).strip() if final_llm_external_context_parts else None
+
        agent_response_dict = self._formulate_answer_with_llm(question, file_ctx_str, final_llm_external_context)
        gaia_logger.info(f"LLM-based model_answer (first 70 chars): {agent_response_dict.get('model_answer', '')[:70]}...")
        return agent_response_dict


+def run_and_submit_all(profile: gr.OAuthProfile | None):
    space_id = os.getenv("SPACE_ID")
    if profile: username = f"{profile.username}"
    else: return "Please Login to Hugging Face.", None

        questions_data = response.json()
        if not questions_data or not isinstance(questions_data, list): return "Questions list empty/invalid.", None
    except Exception as e: return f"Error fetching questions: {e}", None
+
    results_log_for_gradio, answers_for_api_submission = [], []
+   # Use a conservative default RPM if unset, matching common free-tier limits.
+   GEMINI_RPM_LIMIT = int(os.getenv("GEMINI_RPM_LIMIT", "10"))  # default to 10 RPM, as on the common free tier
+   # Add a small buffer to the sleep time
+   sleep_llm = (60.0 / GEMINI_RPM_LIMIT) + 0.5 if GEMINI_RPM_LIMIT > 0 else 0.2
+   gaia_logger.info(f"Using Gemini RPM limit: {GEMINI_RPM_LIMIT}, LLM call sleep: {sleep_llm:.2f}s")

    for i, item in enumerate(questions_data):
        task_id, q_text = item.get("task_id"), item.get("question")
        model_answer_val = "AGENT ERROR"

            results_log_for_gradio.append({"Task ID": task_id, "Question": q_text, "Submitted Answer": model_answer_val, "Reasoning Trace": reasoning_trace_val})
            answers_for_api_submission.append({"task_id": task_id, "submitted_answer": model_answer_val})
            continue
+
        gaia_logger.info(f"Q {i+1}/{len(questions_data)} - Task: {task_id}")
        try:
            agent_response_dict = agent(question=q_text, task_id=task_id)

            gaia_logger.error(f"Error during agent call for task {task_id}: {e}", exc_info=True)
            model_answer_val = "AGENT EXECUTION ERROR"
            reasoning_trace_val = f"Agent call failed: {type(e).__name__} - {str(e)}"
+
        answers_for_api_submission.append({"task_id": task_id, "submitted_answer": model_answer_val})
        results_log_for_gradio.append({"Task ID": task_id, "Question": q_text, "Submitted Answer": model_answer_val, "Reasoning Trace (first 500 chars)": reasoning_trace_val[:500] + ("..." if len(reasoning_trace_val) > 500 else "")})
+
        if i < len(questions_data) - 1: time.sleep(sleep_llm)
+
    if not answers_for_api_submission: return "Agent produced no answers for API submission.", pd.DataFrame(results_log_for_gradio or [{"Info": "No questions processed"}])
+
    submission_payload_for_api = {
+       "username": username.strip(),
+       "agent_code": agent_code,
+       "answers": answers_for_api_submission
    }
    gaia_logger.info(f"Submitting {len(answers_for_api_submission)} answers for '{username}' to API...")
    gaia_logger.debug(f"API Submission Payload Sample: {json.dumps(submission_payload_for_api)[:500]}")

    try:
+       response = requests.post(submit_url, json=submission_payload_for_api, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        status = (f"Submission Successful!\nUser: {result_data.get('username')}\nScore: {result_data.get('score','N/A')}% "

        return f"Submission Failed: {err_detail}", pd.DataFrame(results_log_for_gradio)
    except Exception as e: return f"Submission Failed: {e}", pd.DataFrame(results_log_for_gradio)

+with gr.Blocks(title="GAIA RAG Agent - Advanced") as demo:
+   gr.Markdown("# GAIA Level 1 Agent")
    gr.Markdown(
        """
        **Instructions:**
+       1. Ensure you are logged in via the Hugging Face Login button below.
+       2. Click 'Run Evaluation & Submit All Answers' to process all questions from the GAIA benchmark and submit them.
        ---
+       This agent uses Retrieval-Augmented Generation (RAG) with multiple search providers, file processing (CSV, JSON, Excel, PDF, audio transcription), and experimental video analysis (bird species identification and counting in YouTube videos) via Hugging Face Transformers. Answers are formulated by a Large Language Model (Google Gemini).
        """
    )
    gr.LoginButton()
    run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
    status_output = gr.Textbox(label="Status / Submission Result", lines=5, interactive=False)
+   results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True, max_rows=20, height=500)
    run_button.click(fn=run_and_submit_all, inputs=[], outputs=[status_output, results_table])

+if __name__ == "__main__":
+   print("\n" + "-"*30 + " GAIA Level 1 Agent - RAG, FileProc, Video Analysis " + "-"*30)
    required_env = {
+       "GOOGLE_GEMINI_API_KEY": GOOGLE_GEMINI_API_KEY,
+       "GOOGLE_API_KEY": GOOGLE_CUSTOM_SEARCH_API_KEY,
+       "GOOGLE_CSE_ID": GOOGLE_CUSTOM_SEARCH_CSE_ID,
        "TAVILY_API_KEY": TAVILY_API_KEY,
    }
    missing_keys = [k for k, v in required_env.items() if not v]
    for k, v in required_env.items(): print(f"✅ {k} found." if v else f"⚠️ WARNING: {k} not set.")

    libraries_to_check = [
+       ("transformers", hf_transformers_pipeline), ("torch", torch),
        ("librosa", librosa), ("openpyxl", openpyxl), ("pdfplumber", pdfplumber),
        ("yt_dlp", yt_dlp), ("cv2 (opencv-python)", cv2), ("BeautifulSoup", BeautifulSoup),
        ("duckduckgo_search", DDGS), ("googleapiclient", build_google_search_service),

    if missing_keys: print(f"\n--- PLEASE SET MISSING ENV VARS FOR FULL FUNCTIONALITY: {', '.join(missing_keys)} ---\n")
    else: print("\n--- All major API Key Environment Variables found. ---")
+
+   # Log the Gemini RPM limit being used
+   gemini_rpm = os.getenv("GEMINI_RPM_LIMIT", "10 (defaulted)")
+   print(f"--- Using GEMINI_RPM_LIMIT: {gemini_rpm} (ensure this matches your Gemini API plan limits) ---")
+
    print("-"*(60 + len(" GAIA Level 1 Agent - RAG, FileProc, Video Analysis ")) + "\n")
    demo.launch(server_name="0.0.0.0", server_port=7860, debug=False, share=False)