from typing import Dict, List, Tuple, Optional import json import sys import glob from pathlib import Path from collections import defaultdict def get_latest_log() -> str: """Find the most recently modified log file in the current directory. Returns: str: Path to the most recently modified log file Raises: SystemExit: If no log files are found in current directory """ log_pattern = "api_usage_*.json" logs = list(Path(".").glob(log_pattern)) if not logs: print(f"No files matching pattern '{log_pattern}' found in current directory") sys.exit(1) return str(max(logs, key=lambda p: p.stat().st_mtime)) def analyze_log_file(filename: str) -> Tuple[List[Dict], List[Dict], Dict[str, List[str]]]: """Analyze a log file for entries missing images and errors. Args: filename: Path to the log file to analyze Returns: Tuple containing: - List of entries with no images - List of skipped/error entries - Dict of processing errors by type Raises: SystemExit: If file cannot be found or read """ no_images = [] errors = defaultdict(list) skipped = [] try: with open(filename, "r") as f: for line_num, line in enumerate(f, 1): # Skip HTTP request logs if line.startswith("HTTP Request:") or line.strip() == "": continue try: # Try to parse the JSON line if not line.strip().startswith("{"): continue entry = json.loads(line.strip()) case_id = entry.get("case_id") question_id = entry.get("question_id") # Skip if we can't identify the question if not case_id or not question_id: continue # Check for explicit skip/error status if entry.get("status") in ["skipped", "error"]: skipped.append( { "case_id": case_id, "question_id": question_id, "reason": entry.get("reason"), "status": entry.get("status"), } ) continue # Check user content for images messages = entry.get("input", {}).get("messages", []) has_image = False for msg in messages: content = msg.get("content", []) if isinstance(content, list): for item in content: if isinstance(item, dict) and item.get("type") == "image_url": has_image = True break if not has_image: no_images.append( { "case_id": case_id, "question_id": question_id, "question": entry.get("input", {}) .get("question_data", {}) .get("question", "")[:100] + "...", # First 100 chars of question } ) except json.JSONDecodeError: errors["json_decode"].append(f"Line {line_num}: Invalid JSON") continue except Exception as e: errors["other"].append(f"Line {line_num}: Error processing entry: {str(e)}") except FileNotFoundError: print(f"Error: Could not find log file: {filename}") sys.exit(1) except Exception as e: print(f"Error reading file {filename}: {str(e)}") sys.exit(1) return no_images, skipped, errors def print_results( filename: str, no_images: List[Dict], skipped: List[Dict], errors: Dict[str, List[str]] ) -> None: """Print analysis results. Args: filename: Name of the analyzed log file no_images: List of entries with no images skipped: List of skipped/error entries errors: Dict of processing errors by type """ print(f"\nAnalyzing log file: {filename}") print("\n=== Questions with No Images ===") if no_images: for entry in no_images: print(f"\nCase ID: {entry['case_id']}") print(f"Question ID: {entry['question_id']}") print(f"Question Preview: {entry['question']}") print(f"\nTotal questions without images: {len(no_images)}") print("\n=== Skipped/Error Questions ===") if skipped: for entry in skipped: print(f"\nCase ID: {entry['case_id']}") print(f"Question ID: {entry['question_id']}") print(f"Status: {entry['status']}") print(f"Reason: {entry.get('reason', 'unknown')}") print(f"\nTotal skipped/error questions: {len(skipped)}") if errors: print("\n=== Processing Errors ===") for error_type, messages in errors.items(): if messages: print(f"\n{error_type}:") for msg in messages: print(f" {msg}") def main() -> None: """Main entry point for log validation script.""" # If a file is specified as an argument, use it; otherwise find the latest log if len(sys.argv) > 1: log_file = sys.argv[1] else: log_file = get_latest_log() no_images, skipped, errors = analyze_log_file(log_file) print_results(log_file, no_images, skipped, errors) if __name__ == "__main__": main()