File size: 5,890 Bytes
d7a7846
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
from typing import Dict, List, Tuple, Optional
import json
import sys
import glob
from pathlib import Path
from collections import defaultdict


def get_latest_log() -> str:
    """Return the newest ``api_usage_*.json`` path in the working directory.

    "Newest" means the path with the largest modification time
    (``st_mtime``) among those matching the pattern.

    Returns:
        str: Path of the most recently modified match

    Raises:
        SystemExit: With status 1 when nothing matches the pattern
    """
    log_pattern = "api_usage_*.json"
    candidates = [*Path(".").glob(log_pattern)]

    if len(candidates) == 0:
        print(f"No files matching pattern '{log_pattern}' found in current directory")
        sys.exit(1)

    def modified_at(candidate: Path) -> float:
        # Modification timestamp used to rank the candidates.
        return candidate.stat().st_mtime

    newest = max(candidates, key=modified_at)
    return str(newest)


def _entry_has_image(entry: Dict) -> bool:
    """Return True if any message content list holds an ``image_url`` item."""
    messages = entry.get("input", {}).get("messages", [])
    for msg in messages:
        content = msg.get("content", [])
        # Plain-string content cannot carry an image part.
        if not isinstance(content, list):
            continue
        if any(isinstance(item, dict) and item.get("type") == "image_url" for item in content):
            return True
    return False


def _question_preview(entry: Dict, limit: int = 100) -> str:
    """Return the question text cut to ``limit`` chars, adding "..." only when cut."""
    question = entry.get("input", {}).get("question_data", {}).get("question", "")
    # Slicing/concatenating (rather than coercing) keeps a non-string question
    # raising TypeError, which the caller records as a processing error.
    return question[:limit] + ("..." if len(question) > limit else "")


def analyze_log_file(filename: str) -> Tuple[List[Dict], List[Dict], Dict[str, List[str]]]:
    """Analyze a log file for entries missing images and errors.

    Only lines whose stripped text starts with ``{`` are parsed as JSON;
    everything else (blank lines, ``HTTP Request:`` lines, other noise) is
    ignored. Entries lacking a truthy ``case_id`` or ``question_id`` are
    ignored as well.

    Args:
        filename: Path to the log file to analyze (read as UTF-8)

    Returns:
        Tuple containing:
            - List of entries with no images; each has ``case_id``,
              ``question_id`` and ``question`` (preview of at most 100
              characters, suffixed with "..." only when truncated)
            - List of entries whose ``status`` is "skipped" or "error"; each
              has ``case_id``, ``question_id``, ``reason`` and ``status``
            - Dict of processing errors by type: ``json_decode`` for
              unparsable lines, ``other`` for any other per-line failure

    Raises:
        SystemExit: If file cannot be found or read
    """
    no_images: List[Dict] = []
    skipped: List[Dict] = []
    errors: Dict[str, List[str]] = defaultdict(list)

    try:
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(filename, "r", encoding="utf-8") as f:
            for line_num, line in enumerate(f, 1):
                stripped = line.strip()
                # Covers blank lines and HTTP request logs too: neither starts with "{".
                if not stripped.startswith("{"):
                    continue

                try:
                    entry = json.loads(stripped)
                except json.JSONDecodeError:
                    errors["json_decode"].append(f"Line {line_num}: Invalid JSON")
                    continue

                # Best-effort per line: a malformed entry is recorded and the
                # scan moves on instead of aborting the whole analysis.
                try:
                    case_id = entry.get("case_id")
                    question_id = entry.get("question_id")

                    # Skip if we can't identify the question
                    if not case_id or not question_id:
                        continue

                    # Explicit skip/error status takes precedence over the image check
                    if entry.get("status") in ("skipped", "error"):
                        skipped.append(
                            {
                                "case_id": case_id,
                                "question_id": question_id,
                                "reason": entry.get("reason"),
                                "status": entry.get("status"),
                            }
                        )
                        continue

                    if not _entry_has_image(entry):
                        no_images.append(
                            {
                                "case_id": case_id,
                                "question_id": question_id,
                                "question": _question_preview(entry),
                            }
                        )
                except Exception as e:
                    errors["other"].append(f"Line {line_num}: Error processing entry: {str(e)}")
    except FileNotFoundError:
        print(f"Error: Could not find log file: {filename}")
        sys.exit(1)
    except Exception as e:
        print(f"Error reading file {filename}: {str(e)}")
        sys.exit(1)

    return no_images, skipped, errors


def print_results(
    filename: str, no_images: List[Dict], skipped: List[Dict], errors: Dict[str, List[str]]
) -> None:
    """Print analysis results to standard output.

    Three sections are written: entries without images, skipped/error
    entries, and (only when at least one error message exists) processing
    errors grouped by type.

    Args:
        filename: Name of the analyzed log file
        no_images: List of entries with no images; each needs the keys
            ``case_id``, ``question_id`` and ``question``
        skipped: List of skipped/error entries; each needs the keys
            ``case_id``, ``question_id`` and ``status``; ``reason`` is optional
        errors: Dict of processing errors by type
    """
    print(f"\nAnalyzing log file: {filename}")

    print("\n=== Questions with No Images ===")
    for entry in no_images:
        print(f"\nCase ID: {entry['case_id']}")
        print(f"Question ID: {entry['question_id']}")
        print(f"Question Preview: {entry['question']}")
    print(f"\nTotal questions without images: {len(no_images)}")

    print("\n=== Skipped/Error Questions ===")
    for entry in skipped:
        print(f"\nCase ID: {entry['case_id']}")
        print(f"Question ID: {entry['question_id']}")
        print(f"Status: {entry['status']}")
        # The key may be present with a None value, in which case a plain
        # .get() default would never apply; fall back on any falsy reason.
        print(f"Reason: {entry.get('reason') or 'unknown'}")
    print(f"\nTotal skipped/error questions: {len(skipped)}")

    # Drop error types with no messages so the header is not printed alone.
    reported = {error_type: messages for error_type, messages in errors.items() if messages}
    if reported:
        print("\n=== Processing Errors ===")
        for error_type, messages in reported.items():
            print(f"\n{error_type}:")
            for msg in messages:
                print(f"  {msg}")


def main() -> None:
    """Run the log validation: pick a log file, analyze it, print the report."""
    # An explicit path on the command line wins; otherwise fall back to the
    # most recently modified log.
    cli_args = sys.argv[1:]
    log_file = cli_args[0] if cli_args else get_latest_log()

    results = analyze_log_file(log_file)
    print_results(log_file, *results)


# Run the validation only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == "__main__":
    main()