cloneofsimo committed
Commit b72fefd · verified · 1 Parent(s): 7e0c0e8

Upload folder using huggingface_hub

Files changed (7)
  1. example.py +203 -0
  2. stage_1.py +199 -0
  3. stage_2.py +292 -0
  4. stage_4.py +506 -0
  5. streamlit_evaluation_app.py +695 -0
  6. upload_to_hf.py +112 -0
  7. validation_runner.py +286 -0
example.py ADDED
@@ -0,0 +1,203 @@
+ #!/usr/bin/env python3
+ """
+ Example inference script for the Multi-Head SigLIP2 Classifier from Hugging Face Hub.
+
+ Usage examples:
+ # Multiple images, single text
+ python example.py --image img1.png --image img2.jpg --repo fal/multihead_cls --text "an example caption"
+
+ # N images, N texts (returns an N x N similarity matrix)
+ python example.py \
+     --image img1.png --image img2.jpg \
+     --text "a cat" --text "a dog" --repo fal/multihead_cls
+
+ Requires: torch, transformers, huggingface_hub, Pillow, click
+ """
+
+ import json
+ import click
+ import torch
+ from PIL import Image
+ from transformers import AutoProcessor
+ from huggingface_hub import hf_hub_download
+
+ # Local model definition replicated from training for easy inference
+ import torch.nn as nn
+ from transformers import SiglipModel
+ import torch.nn.functional as F
+
+ CKPT = "google/siglip-base-patch16-256"
+
+ class MultiHeadSiglipClassifier(nn.Module):
+     """Dynamic multi-head classifier based on task configuration"""
+     def __init__(self, task_config: dict, model_name: str = CKPT):
+         super().__init__()
+         self.task_config = task_config
+         self.siglip = SiglipModel.from_pretrained(model_name)
+
+         # Freeze SigLIP parameters
+         for param in self.siglip.parameters():
+             param.requires_grad = False
+
+         # Create classification heads dynamically based on task config
+         hidden_size = self.siglip.config.vision_config.hidden_size
+         self.classification_heads = nn.ModuleDict()
+
+         for task in task_config['tasks']:
+             task_key = task['key']
+             num_classes = len(task['labels'])
+
+             # Create linear layer for this task
+             head = nn.Linear(hidden_size, num_classes)
+             self.classification_heads[task_key] = head
+
+     def forward(self, pixel_values):
+         # Get SigLIP image embeddings only
+         combined_embeds = self.siglip.get_image_features(pixel_values=pixel_values)
+
+         # Apply all classification heads
+         outputs = {}
+         for task_key, head in self.classification_heads.items():
+             outputs[task_key] = head(combined_embeds)
+
+         return outputs
+
+
+ def load_model_from_hf(repo_id: str):
+     """Load model, processor, and task config from Hugging Face Hub"""
+     # Download task configuration
+     try:
+         task_config_path = hf_hub_download(repo_id=repo_id, filename="task_config.json", repo_type="model")
+         with open(task_config_path, 'r') as f:
+             task_config = json.load(f)
+     except Exception as e:
+         raise RuntimeError(f"Could not load task_config.json from {repo_id}: {e}")
+
+     # Load processor
+     processor = AutoProcessor.from_pretrained(CKPT)
+
+     # Create model with task config
+     model = MultiHeadSiglipClassifier(task_config)
+
+     # Load trained weights
+     try:
+         ckpt_path = hf_hub_download(repo_id=repo_id, filename="model.pth", repo_type="model")
+         state_dict = torch.load(ckpt_path, map_location="cpu")
+         model.load_state_dict(state_dict)
+     except Exception as e:
+         raise RuntimeError(f"Could not load model.pth from {repo_id}: {e}")
+
+     model.eval()
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+
+     return model, processor, device, task_config
+
+
+ def predict_batch(model, processor, device, task_config, image_paths, texts: list[str] | None = None):
+     """Run predictions on a batch of images using dynamic task configuration"""
+     images = [Image.open(p).convert("RGB") for p in image_paths]
+     if texts is not None and len(texts) == 0:
+         texts = None
+
+     # Process images
+     image_inputs = processor(images=images, return_tensors="pt")
+     pixel_values = image_inputs["pixel_values"].to(device)
+
+     with torch.no_grad():
+         outputs = model(pixel_values)
+         # Compute image embeddings for similarity
+         image_embeds = model.siglip.get_image_features(pixel_values=pixel_values)
+         image_embeds = F.normalize(image_embeds, p=2, dim=-1)
+
+         # Prepare text inputs if provided
+         text_embeds = None
+         input_ids = None
+         attention_mask = None
+         if texts is not None:
+             text_inputs = processor(text=texts, padding="max_length", return_tensors="pt")
+             input_ids = text_inputs["input_ids"].to(device)
+             attention_mask = text_inputs.get("attention_mask")
+             attention_mask = attention_mask.to(device) if attention_mask is not None else None
+             text_embeds = model.siglip.get_text_features(input_ids=input_ids, attention_mask=attention_mask)
+             text_embeds = F.normalize(text_embeds, p=2, dim=-1)
+
+     # Create task mappings
+     tasks = {task['key']: task for task in task_config['tasks']}
+
+     batch_results = []
+     batch_size = pixel_values.shape[0]
+
+     for i in range(batch_size):
+         item = {"image": str(image_paths[i])}
+
+         # Process each task dynamically
+         for task_key, task_info in tasks.items():
+             logits = outputs[task_key][i]
+             probs = torch.softmax(logits, dim=0)
+             pred_idx = torch.argmax(probs).item()
+
+             if task_info['type'] == 'binary':
+                 # Binary classification
+                 item[f"{task_key}_prediction"] = task_info['labels'][pred_idx]
+                 item[f"{task_key}_confidence"] = float(probs[pred_idx].item())
+                 item[f"{task_key}_prob_yes"] = float(probs[1].item()) if len(task_info['labels']) > 1 else 0.0
+                 item[f"{task_key}_prob_no"] = float(probs[0].item())
+
+             elif task_info['type'] == 'multi_class':
+                 # Multi-class classification
+                 item[f"{task_key}_prediction"] = task_info['labels'][pred_idx]
+                 item[f"{task_key}_confidence"] = float(probs[pred_idx].item())
+
+                 # Add probabilities for all classes
+                 for idx, label in enumerate(task_info['labels']):
+                     item[f"{task_key}_prob_{label}"] = float(probs[idx].item())
+
+         batch_results.append(item)
+
+     cosine_matrix = None
+
+     if input_ids is not None:
+         # Embeddings were L2-normalized above, so this matmul gives cosine similarity
+         cosine = torch.matmul(image_embeds, text_embeds.T)
+         cosine_matrix = cosine.cpu().tolist()
+
+     return {
+         "images": [str(p) for p in image_paths],
+         "texts": texts or [],
+         "task_config": task_config,
+         "predictions": batch_results,
+         "cosine_similarity": cosine_matrix,
+     }
+
+
+ @click.command()
+ @click.option("--image", "images", multiple=True, type=click.Path(exists=True, dir_okay=False, readable=True), help="Path(s) to image file(s). Can be passed multiple times.")
+ @click.option("--repo", default="fal/multihead_cls", show_default=True, help="Hugging Face repo id with model checkpoint.")
+ @click.option("--text", "texts", multiple=True, help="Text prompt(s). Can be passed multiple times to build an N x N image-text similarity matrix.")
+ @click.option("--show-tasks", is_flag=True, help="Show available classification tasks and exit.")
+ def cli(images, repo, texts, show_tasks):
+     """Multi-head SigLIP2 classifier inference from Hugging Face Hub"""
+
+     # Load model and task config
+     model, processor, device, task_config = load_model_from_hf(repo)
+
+     if show_tasks:
+         click.echo("Available classification tasks:")
+         for i, task in enumerate(task_config['tasks'], 1):
+             click.echo(f" {i}. {task['name']} ({task['key']})")
+             click.echo(f"    Type: {task['type']}")
+             click.echo(f"    Labels: {', '.join(task['labels'])}")
+             click.echo(f"    Description: {task['description']}")
+             click.echo()
+         return
+
+     if not images:
+         images = ("img.png",)
+
+     results = predict_batch(model, processor, device, task_config, list(images), texts=list(texts) if texts else None)
+     click.echo(json.dumps(results, indent=2))
+
+
+ if __name__ == "__main__":
+     cli()
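
Note: both example.py and stage_4.py consume a task_config.json that is not included in this commit. Judging only from the fields the code reads (each task needs 'key', 'name', 'type', 'labels', and 'description', with binary labels ordered so that index 0 is "no" and index 1 is "yes"), the file is shaped roughly like the illustrative sketch below; the task names and descriptions here are assumptions, not the actual config, and the category labels are taken from the Gemini prompt in stage_2.py.

    {
      "tasks": [
        {
          "key": "is_professional",
          "name": "Professional Photo",
          "type": "binary",
          "labels": ["no", "yes"],
          "description": "Whether the image looks like it was taken by a professional photographer."
        },
        {
          "key": "category",
          "name": "Image Category",
          "type": "multi_class",
          "labels": ["animals", "artifacts", "people", "outdoor_scenes", "illustrations", "vehicles", "food_and_beverage", "arts", "abstract", "produce_and_plants", "indoor_scenes"],
          "description": "Coarse content category of the image."
        }
      ]
    }
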
stage_1.py ADDED
@@ -0,0 +1,199 @@
+ #!/usr/bin/env python3
+ """
+ Stage 1: Data Loading and Image Downloading
+ Downloads and preprocesses the top N images from a parquet file
+ """
+
+ import os
+ import json
+ import requests
+ import pandas as pd
+ from PIL import Image
+ from io import BytesIO
+ import concurrent.futures
+ from pathlib import Path
+ import time
+ import logging
+ import numpy as np
+ from typing import Tuple
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ def setup_environment():
+     """Setup data directory"""
+     os.makedirs('./data', exist_ok=True)
+     os.makedirs('./data/images', exist_ok=True)
+     os.makedirs('./data/metadata', exist_ok=True)
+     return True
+
+ def load_and_sample_data(parquet_path: str, n_samples: int = 2000) -> pd.DataFrame:
+     """Load parquet file and sample top N rows"""
+     logger.info(f"Loading data from {parquet_path}")
+     df = pd.read_parquet(parquet_path)
+     logger.info(f"Loaded {len(df)} rows, sampling top {n_samples}")
+     return df.head(n_samples)
+
+ def has_white_edges(img: Image.Image, threshold: int = 240) -> bool:
+     """Check if image has 3 or more white edges (mean RGB > threshold)"""
+     try:
+         img_array = np.array(img)
+         height, width = img_array.shape[:2]
+
+         # Define edge thickness (check 5 pixels from each edge)
+         edge_thickness = 5
+
+         # Get edges
+         top_edge = img_array[:edge_thickness, :].mean(axis=(0, 1))
+         bottom_edge = img_array[-edge_thickness:, :].mean(axis=(0, 1))
+         left_edge = img_array[:, :edge_thickness].mean(axis=(0, 1))
+         right_edge = img_array[:, -edge_thickness:].mean(axis=(0, 1))
+
+         # Check if edge is white (all RGB channels > threshold)
+         edges = [top_edge, bottom_edge, left_edge, right_edge]
+         white_edges = sum(1 for edge in edges if np.all(edge > threshold))
+
+         return white_edges >= 3
+     except Exception as e:
+         logger.debug(f"Error checking white edges: {e}")
+         return False
+
+ def download_and_process_image(url: str, target_size: int = 256) -> Image.Image:
+     """Download image and resize with center crop, skip if has white edges"""
+     try:
+         response = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
+         response.raise_for_status()
+
+         img = Image.open(BytesIO(response.content)).convert('RGB')
+
+         # Check for white edges before processing
+         if has_white_edges(img):
+             logger.debug(f"Skipping image with white edges: {url}")
+             return None
+
+         # Resize and center crop to target_size x target_size
+         width, height = img.size
+         min_side = min(width, height)
+         scale = target_size / min_side
+
+         new_width = int(width * scale)
+         new_height = int(height * scale)
+
+         img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+         # Center crop
+         left = (new_width - target_size) // 2
+         top = (new_height - target_size) // 2
+         right = left + target_size
+         bottom = top + target_size
+
+         img = img.crop((left, top, right, bottom))
+
+         # Double-check after processing
+         if has_white_edges(img):
+             logger.debug(f"Skipping processed image with white edges: {url}")
+             return None
+
+         return img
+     except Exception as e:
+         logger.error(f"Error downloading {url}: {e}")
+         return None
+
+ def process_single_image(args: Tuple[int, str, str, str]) -> bool:
+     """Download and save a single image"""
+     idx, url, hash_val, caption = args
+
+     try:
+         # Download and process image
+         image = download_and_process_image(url)
+         if image is None:
+             logger.debug(f"Skipped image {idx} (white edges or download error)")
+             return False
+
+         # Save image
+         image_path = f'./data/images/img_{idx}.png'
+         image.save(image_path)
+
+         # Save metadata for next stage
+         metadata = {
+             "idx": idx,
+             "caption": caption,
+             "url": url,
+             "hash": hash_val,
+             "image_path": image_path
+         }
+
+         metadata_path = f'./data/metadata/meta_{idx}.json'
+         with open(metadata_path, 'w') as f:
+             json.dump(metadata, f, indent=2)
+
+         logger.info(f"Downloaded and saved image {idx}")
+         return True
+
+     except Exception as e:
+         logger.error(f"Error processing image {idx}: {e}")
+         return False
+
+ def download_images(df: pd.DataFrame, max_workers: int = 20):
+     """Download all images with parallel processing"""
+     logger.info(f"Starting image download with {max_workers} workers...")
+
+     args_list = [(i, row['url'], row['hash'], row['caption'])
+                  for i, (_, row) in enumerate(df.iterrows())]
+
+     successful = 0
+     skipped_white_edges = 0
+
+     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+         futures = [executor.submit(process_single_image, args) for args in args_list]
+
+         for i, future in enumerate(concurrent.futures.as_completed(futures)):
+             if future.result():
+                 successful += 1
+             else:
+                 skipped_white_edges += 1
+
+             # Progress logging every 100 images
+             if (i + 1) % 100 == 0:
+                 logger.info(f"Processed {i + 1}/{len(args_list)} images (successful: {successful}, skipped: {skipped_white_edges})")
+
+             # Minimal rate limiting for high concurrency
+             time.sleep(0.01)
+
+     logger.info(f"Download complete: {successful}/{len(args_list)} images downloaded, {skipped_white_edges} skipped (white edges or errors)")
+
+     # Save summary
+     summary = {
+         "total_images": len(args_list),
+         "successful_downloads": successful,
+         "skipped_white_edges": skipped_white_edges,
+         "download_rate": f"{successful/len(args_list)*100:.1f}%",
+         "stage": "download_complete"
+     }
+
+     with open('./data/stage1_summary.json', 'w') as f:
+         json.dump(summary, f, indent=2)
+
+ def main():
+     """Main execution for Stage 1"""
+     logger.info("Starting Stage 1: Data Loading and Image Downloading...")
+
+     # Setup
+     setup_environment()
+
+     # Load data
+     parquet_path = '/home/fal/partiprompt_clip/curated_part_00000.parquet'
+     df = load_and_sample_data(parquet_path, n_samples=5000)
+
+     # Save the dataframe for other stages
+     df.to_pickle('./data/sampled_data.pkl')
+
+     # Download images with optimized settings
+     download_images(df, max_workers=30)
+
+     logger.info("Stage 1 completed successfully!")
+
+ if __name__ == "__main__":
+     main()
stage_2.py ADDED
@@ -0,0 +1,292 @@
+ #!/usr/bin/env python3
+ """
+ Stage 2: Gemini Vision Classification
+ Classifies images using Google Gemini with 5 classification tasks
+ """
+
+ import os
+ import json
+ from PIL import Image
+ from io import BytesIO
+ import concurrent.futures
+ from pathlib import Path
+ import time
+ import logging
+ from typing import Dict, Any
+ import mimetypes
+ import random
+
+ # Gemini SDK
+ from google import genai
+ from google.genai.errors import ServerError
+ from google.genai.types import (
+     Blob, Part, Content, GenerateContentConfig,
+ )
+
+ # Set up logging
+ logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ GEMINI_API_KEY_FALLBACK = "<REDACTED_GEMINI_API_KEY>"
+
+
+ def check_api_key():
+     """Ensure Google API key is set for Gemini client."""
+     if not os.getenv('GOOGLE_API_KEY'):
+         # Use provided key as fallback if env not set
+         os.environ['GOOGLE_API_KEY'] = GEMINI_API_KEY_FALLBACK
+     return True
+
+ def _guess_mime_type(image_path: str) -> str:
+     guessed, _ = mimetypes.guess_type(image_path)
+     if guessed:
+         return guessed
+     try:
+         with Image.open(image_path) as im:
+             fmt = (im.format or '').lower()
+             if fmt in ('jpeg', 'jpg'):
+                 return 'image/jpeg'
+             if fmt == 'png':
+                 return 'image/png'
+             if fmt == 'webp':
+                 return 'image/webp'
+             if fmt == 'gif':
+                 return 'image/gif'
+     except Exception:
+         pass
+     return 'application/octet-stream'
+
+ def _gemini_call_with_retry(contents, cfg, max_attempts: int = 5):
+     """Call Gemini with retries on server/errors."""
+     api_key = os.getenv('GOOGLE_API_KEY') or GEMINI_API_KEY_FALLBACK
+     for attempt in range(max_attempts):
+         try:
+             client = genai.Client(api_key=api_key)
+             return client.models.generate_content(
+                 model="models/gemini-2.5-flash",
+                 contents=contents,
+                 config=cfg,
+             )
+         except ServerError as e:
+             sleep_s = (2 ** attempt) + random.random()
+             logger.warning(f"Gemini server error attempt {attempt+1}/{max_attempts}: {e}; retrying in {sleep_s:.1f}s")
+             time.sleep(sleep_s)
+         except Exception as e:
+             sleep_s = (2 ** attempt) + random.random()
+             logger.warning(f"Gemini error attempt {attempt+1}/{max_attempts}: {e}; retrying in {sleep_s:.1f}s")
+             time.sleep(sleep_s)
+     raise RuntimeError(f"Persistent Gemini error after {max_attempts} tries")
+
+
+ def classify_image_with_gemini(image_path: str, caption: str, max_retries: int = 3) -> Dict[str, Any]:
+     """Use Google Gemini to classify an image with structured JSON output."""
+     prompt = f"""
+     Analyze this image with caption: "{caption}"
+
+     Please answer the following classification questions and respond ONLY with valid JSON:
+
+     1. Overall Description.
+     2. Is the image product display / low quality advertisement / e-commerce product? Answer: "yes" or "no"
+     3. Is the image computer screenshot with many text overlays? Answer: "yes" or "no"
+     4. In what category is the image? Choose one from: "animals", "artifacts", "people", "outdoor_scenes", "illustrations", "vehicles", "food_and_beverage", "arts", "abstract", "produce_and_plants", "indoor_scenes"
+     5. Would you say the image is interesting? Answer: "yes" or "no"
+     6. Do you think the photo/image was made by a professional photographer? Answer: "yes" or "no"
+
+     IMPORTANT: Respond with ONLY a valid JSON object with these exact keys. Do not include any other text or explanation:
+
+     {{
+         "overall_description": "...",
+         "is_product_advertisement": "yes",
+         "is_screenshot_with_text": "no",
+         "category": "animals",
+         "is_interesting": "no",
+         "is_professional": "yes"
+     }}
+     """
+
+     default_response = {
+         "overall_description": "...",
+         "is_product_advertisement": "...",
+         "is_screenshot_with_text": "...",
+         "category": "...",
+         "is_interesting": "...",
+         "is_professional": "..."
+     }
+
+     try:
+         with open(image_path, 'rb') as f:
+             image_bytes = f.read()
+         mime_type = _guess_mime_type(image_path)
+
+         image_blob = Blob(mime_type=mime_type, data=image_bytes)
+         user_content = Content(
+             role="user",
+             parts=[
+                 Part(text=prompt),
+                 Part(inline_data=image_blob),
+             ],
+         )
+         contents = [user_content]
+         cfg = GenerateContentConfig(max_output_tokens=2500, temperature=0)
+
+         resp = _gemini_call_with_retry(contents, cfg, max_attempts=max_retries)
+         logger.debug(f"Gemini response type: {type(resp)}")
+
+         # Detailed debugging of response structure
+         logger.debug(f"Response.text: {getattr(resp, 'text', 'NO_TEXT_ATTR')}")
+         logger.debug(f"Response.candidates: {getattr(resp, 'candidates', 'NO_CANDIDATES_ATTR')}")
+         if hasattr(resp, 'candidates') and resp.candidates:
+             logger.debug(f"Number of candidates: {len(resp.candidates)}")
+             for i, candidate in enumerate(resp.candidates):
+                 logger.debug(f"Candidate {i}: {candidate}")
+                 if hasattr(candidate, 'content'):
+                     logger.debug(f"Candidate {i} content: {candidate.content}")
+                     if hasattr(candidate.content, 'parts'):
+                         logger.debug(f"Candidate {i} parts: {candidate.content.parts}")
+
+         # Check for prompt_feedback which might indicate filtering
+         if hasattr(resp, 'prompt_feedback'):
+             logger.debug(f"Prompt feedback: {resp.prompt_feedback}")
+
+         # Extract text from Gemini response
+         content_text = ""
+         try:
+             # Try the .text property first
+             if hasattr(resp, 'text') and resp.text:
+                 content_text = resp.text
+                 logger.debug(f"Got text from .text property: {content_text[:100]}...")
+             else:
+                 # Fallback: extract from candidates
+                 if resp.candidates and len(resp.candidates) > 0:
+                     candidate = resp.candidates[0]
+                     if hasattr(candidate, 'content') and candidate.content:
+                         if hasattr(candidate.content, 'parts') and candidate.content.parts:
+                             for part in candidate.content.parts:
+                                 if hasattr(part, 'text') and part.text:
+                                     content_text += part.text
+                                     logger.debug(f"Got text from candidate part: {part.text[:100]}...")
+         except Exception as e:
+             logger.error(f"Error extracting text from Gemini response: {e}")
+             raise e
+
+         if not content_text:
+             logger.error("Empty response from Gemini")
+             return default_response
+
+         content_text = content_text.strip()
+         start_idx = content_text.find('{')
+         end_idx = content_text.rfind('}') + 1
+         if start_idx == -1 or end_idx == 0:
+             logger.error(f"No JSON found in response: {content_text}")
+             return default_response
+
+         json_content = content_text[start_idx:end_idx]
+         classification = json.loads(json_content)
+
+         required_keys = [
+             "overall_description",
+             "is_product_advertisement",
+             "is_screenshot_with_text",
+             "category",
+             "is_interesting",
+             "is_professional",
+         ]
+         missing_keys = [key for key in required_keys if key not in classification]
+         if missing_keys:
+             logger.warning(f"Missing keys in classification: {missing_keys}")
+             for key in missing_keys:
+                 classification[key] = default_response[key]
+
+         return classification
+     except json.JSONDecodeError as e:
+         logger.error(f"JSON parsing error: {e}")
+         return default_response
+     except Exception as e:
+         logger.error(f"Gemini classification error: {e}")
+         return default_response
+
+ def classify_single_image(metadata_file: Path) -> bool:
+     """Classify a single image and save results"""
+     try:
+         # Load metadata
+         with open(metadata_file, 'r', encoding='utf-8') as f:
+             metadata = json.load(f)
+
+         idx = metadata['idx']
+         image_path = metadata['image_path']
+         caption = metadata['caption']
+
+         # Check if image exists
+         if not os.path.exists(image_path):
+             logger.error(f"Image not found: {image_path}")
+             return False
+
+         # Classify with Gemini
+         classification = classify_image_with_gemini(image_path, caption)
+
+         # Add classification to metadata
+         metadata['classification'] = classification
+         metadata['stage2_complete'] = True
+
+         # Save updated metadata
+         new_metadata_file = metadata_file.with_name(f'meta_{idx}_stage2.json')
+         with open(new_metadata_file, 'w', encoding='utf-8') as f:
+             json.dump(metadata, f, indent=2, ensure_ascii=False)
+
+         logger.info(f"Classified image {idx}")
+         return True
+
+     except Exception as e:
+         logger.error(f"Error classifying {metadata_file}: {e}")
+         return False
+
+ def classify_all_images(max_workers: int = 2):
+     """Classify all downloaded images with parallel processing"""
+     logger.info("Starting image classification...")
+
+     # Get all metadata files
+     metadata_dir = Path('./data/metadata')
+     metadata_files = list(metadata_dir.glob('meta_*.json'))
+
+     if not metadata_files:
+         logger.error("No metadata files found. Run stage 1 first.")
+         return
+
+     successful = 0
+     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+         futures = [executor.submit(classify_single_image, meta_file) for meta_file in metadata_files]
+
+         for future in concurrent.futures.as_completed(futures):
+             if future.result():
+                 successful += 1
+
+             # Rate limiting for API calls
+             time.sleep(1.0)  # 1 second between API calls to avoid rate limits
+
+     logger.info(f"Successfully classified {successful}/{len(metadata_files)} images")
+
+     # Save summary
+     summary = {
+         "total_images": len(metadata_files),
+         "successful_classifications": successful,
+         "stage": "classification_complete"
+     }
+
+     with open('./data/stage2_summary.json', 'w') as f:
+         json.dump(summary, f, indent=2)
+
+ def main():
+     """Main execution for Stage 2"""
+     logger.info("Starting Stage 2: Gemini Vision Classification...")
+
+     # Check API key
+     if not check_api_key():
+         return
+
+     # Classify images
+     classify_all_images(max_workers=64)
+
+     logger.info("Stage 2 completed successfully!")
+
+ if __name__ == "__main__":
+     main()
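
For orientation, each stage-2 output file (./data/metadata/meta_{idx}_stage2.json) ends up holding the stage-1 metadata plus the Gemini labels and a completion flag; this is what stage_4.py later consumes. The illustrative example below uses made-up values; only the keys follow the code above.

    {
      "idx": 42,
      "caption": "a cat sitting on a windowsill",
      "url": "https://example.com/cat.jpg",
      "hash": "abc123",
      "image_path": "./data/images/img_42.png",
      "classification": {
        "overall_description": "A cat sits on a sunny windowsill.",
        "is_product_advertisement": "no",
        "is_screenshot_with_text": "no",
        "category": "animals",
        "is_interesting": "yes",
        "is_professional": "no"
      },
      "stage2_complete": true
    }
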
stage_4.py ADDED
@@ -0,0 +1,506 @@
+ #!/usr/bin/env python3
+ """
+ Stage 4: SigLIP v2 Multi-Head Classifier Training
+ Trains a SigLIP v2-based multi-head classifier on pseudo-labeled data
+ """
+
+ import os
+ import json
+ import torch
+ import torch.nn as nn
+ import torch.optim as optim
+ from torch.utils.data import Dataset, DataLoader
+ from transformers import SiglipModel, AutoProcessor
+ import numpy as np
+ from PIL import Image
+ from pathlib import Path
+ import logging
+ from typing import Dict, List, Any
+ import pickle
+ import matplotlib.pyplot as plt
+ from torch.optim.lr_scheduler import LambdaLR
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ CKPT = "google/siglip-base-patch16-256"
+
+ def load_task_config(config_path: str = './task_config.json'):
+     """Load task configuration from JSON file"""
+     if not os.path.exists(config_path):
+         raise FileNotFoundError(f"Task configuration not found: {config_path}")
+
+     with open(config_path, 'r') as f:
+         config = json.load(f)
+
+     logger.info(f"Loaded task configuration with {len(config['tasks'])} tasks")
+     return config
+
+ class MultiHeadDataset(Dataset):
+     """Dataset for multi-head classification with configurable tasks"""
+     def __init__(self, data_dir: str, processor, task_config: Dict):
+         self.data_dir = Path(data_dir)
+         self.processor = processor
+         self.task_config = task_config
+
+         # Load all metadata files from stage 2 (with _stage2 suffix)
+         metadata_dir = self.data_dir / 'metadata'
+         if not metadata_dir.exists():
+             raise FileNotFoundError("Metadata directory not found. Run stages 1 and 2 first.")
+
+         metadata_files = list(metadata_dir.glob('meta_*_stage2.json'))
+         if not metadata_files:
+             raise FileNotFoundError("No stage 2 metadata files found. Run stage 2 first.")
+
+         # Load all samples
+         self.samples = []
+         skipped_incomplete = 0
+
+         for meta_file in metadata_files:
+             try:
+                 with open(meta_file, 'r') as f:
+                     metadata = json.load(f)
+
+                 # Check if classification is complete
+                 if not metadata.get('stage2_complete', False):
+                     logger.warning(f"Skipping {meta_file} - classification not complete")
+                     skipped_incomplete += 1
+                     continue
+
+                 # Check if classification contains incomplete data (empty or "..." values)
+                 classification = metadata.get('classification', {})
+                 if not classification or self._is_incomplete_classification(classification):
+                     logger.warning(f"Skipping {meta_file} - incomplete classification data")
+                     skipped_incomplete += 1
+                     continue
+
+                 # Check if image exists
+                 image_path = metadata['image_path']
+                 if not os.path.exists(image_path):
+                     logger.warning(f"Image not found: {image_path}")
+                     skipped_incomplete += 1
+                     continue
+
+                 self.samples.append(metadata)
+
+             except Exception as e:
+                 logger.error(f"Error loading {meta_file}: {e}")
+                 skipped_incomplete += 1
+
+         # Create label mappings from task config
+         self.label_mappings = {}
+         for task in self.task_config['tasks']:
+             if task['type'] == 'multi_class':
+                 self.label_mappings[task['key']] = {
+                     label: idx for idx, label in enumerate(task['labels'])
+                 }
+
+         if skipped_incomplete > 0:
+             logger.warning(f"Skipped {skipped_incomplete} incomplete samples")
+         logger.info(f"Loaded {len(self.samples)} valid samples for training")
+
+     def _is_incomplete_classification(self, classification: Dict) -> bool:
+         """Check if classification contains incomplete data (empty or '...' values)"""
+         required_tasks = [task['key'] for task in self.task_config['tasks']]
+
+         for task_key in required_tasks:
+             if task_key not in classification:
+                 return True
+
+             value = classification[task_key]
+             # Check for incomplete markers
+             if not value or value == "..." or value == "" or value is None:
+                 return True
+
+         return False
+
+     def __len__(self):
+         return len(self.samples)
+
+     def __getitem__(self, idx):
+         sample = self.samples[idx]
+
+         # Load image
+         image = Image.open(sample['image_path']).convert('RGB')
+
+         # Process image only
+         inputs = self.processor(
+             images=image,
+             return_tensors="pt"
+         )
+
+         # Convert classifications to labels based on task config
+         classification = sample['classification']
+         labels = {}
+
+         for task in self.task_config['tasks']:
+             task_key = task['key']
+             if task['type'] == 'binary':
+                 # Binary tasks: convert yes/no to 1/0
+                 labels[task_key] = 1 if classification[task_key] == 'yes' else 0
+             elif task['type'] == 'multi_class':
+                 # Multi-class tasks: convert to index
+                 label_str = classification[task_key]
+                 labels[task_key] = self.label_mappings[task_key].get(label_str, 0)  # default to first class
+
+         return {
+             'pixel_values': inputs['pixel_values'].squeeze(0),
+             'labels': labels,
+             'metadata': {
+                 'idx': sample['idx'],
+                 'caption': sample['caption'],
+                 'image_path': sample['image_path']
+             }
+         }
+
+ class MultiHeadSiglipClassifier(nn.Module):
+     """SigLIP-based multi-head classifier with configurable tasks"""
+     def __init__(self, task_config: Dict, model_name: str = CKPT):
+         super().__init__()
+
+         self.task_config = task_config
+         self.siglip = SiglipModel.from_pretrained(model_name)
+
+         # Freeze SigLIP parameters initially
+         for param in self.siglip.parameters():
+             param.requires_grad = False
+
+         # Create classification heads dynamically based on task config
+         hidden_size = self.siglip.config.vision_config.hidden_size
+         self.classification_heads = nn.ModuleDict()
+
+         for task in task_config['tasks']:
+             task_key = task['key']
+             num_classes = len(task['labels'])
+
+             # Create linear layer for this task
+             head = nn.Linear(hidden_size, num_classes)
+
+             # Initialize with zeros
+             head.weight.data.zero_()
+             head.bias.data.zero_()
+
+             self.classification_heads[task_key] = head
+
+         logger.info(f"Created {len(self.classification_heads)} classification heads")
+
+     def forward(self, pixel_values):
+         # Get SigLIP image embeddings only
+         combined_embeds = self.siglip.get_image_features(pixel_values=pixel_values)
+
+         # Apply all classification heads
+         outputs = {}
+         for task_key, head in self.classification_heads.items():
+             outputs[task_key] = head(combined_embeds)
+
+         return outputs
+
+ def calculate_accuracy(predictions, labels):
+     """Calculate accuracy for binary/multi-class predictions"""
+     pred_classes = torch.argmax(predictions, dim=1)
+     correct = (pred_classes == labels).float()
+     return correct.mean().item()
+
+ def plot_validation_accuracies(history, task_config, save_path='./checkpoints/validation_accuracies.png'):
+     """Create and save validation accuracy plots"""
+     tasks = [task['key'] for task in task_config['tasks']]
+     task_names = [task['name'] for task in task_config['tasks']]
+
+     # Calculate grid size
+     n_tasks = len(tasks)
+     n_cols = 3
+     n_rows = (n_tasks + n_cols - 1) // n_cols  # Ceiling division
+
+     fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows))
+     fig.suptitle('Training Progress Dashboard', fontsize=16, fontweight='bold')
+
+     # Flatten axes for easier indexing
+     if n_rows == 1:
+         axes = [axes] if n_cols == 1 else axes
+     else:
+         axes = axes.flatten()
+
+     epochs = range(1, len(history['val_accuracy'][tasks[0]]) + 1)
+     colors = plt.cm.Set1(np.linspace(0, 1, n_tasks))
+
+     # Plot individual validation accuracies
+     for i, (task_key, task_name, color) in enumerate(zip(tasks, task_names, colors)):
+         if i < len(axes):
+             axes[i].plot(epochs, history['val_accuracy'][task_key],
+                          label=task_name, marker='o', color=color, linewidth=2, markersize=4)
+             axes[i].set_xlabel('Epoch')
+             axes[i].set_ylabel('Validation Accuracy')
+             axes[i].set_title(f'{task_name} Validation Accuracy')
+             axes[i].grid(True, alpha=0.3)
+             axes[i].set_ylim(0, 1)
+
+     # Hide unused subplots
+     for i in range(n_tasks, len(axes)):
+         axes[i].set_visible(False)
+
+     plt.tight_layout()
+     plt.savefig(save_path, dpi=300, bbox_inches='tight')
+     plt.close()
+
+     logger.info(f"Validation accuracy plots saved to {save_path}")
+
+     # Calculate summary statistics
+     best_accs = [max(history['val_accuracy'][task]) for task in tasks]
+     final_accs = [history['val_accuracy'][task][-1] for task in tasks]
+
+     return best_accs, final_accs
+
+ def train_multi_head_classifier(data_dir: str, task_config_path: str = './task_config.json',
+                                 epochs: int = 30, batch_size: int = 4):
+     """Train the multi-head SigLIP v2 classifier"""
+     logger.info("Starting multi-head classifier training...")
+
+     # Load task configuration
+     task_config = load_task_config(task_config_path)
+
+     # Create checkpoints directory
+     checkpoint_dir = Path('./checkpoints')
+     checkpoint_dir.mkdir(exist_ok=True)
+     logger.info(f"Checkpoints will be saved to: {checkpoint_dir}")
+
+     # Save task config to checkpoints for inference
+     with open(checkpoint_dir / 'task_config.json', 'w') as f:
+         json.dump(task_config, f, indent=2)
+
+     # Load processor and model
+     processor = AutoProcessor.from_pretrained(CKPT)
+     model = MultiHeadSiglipClassifier(task_config, model_name=CKPT)
+
+     # Dataset and dataloader
+     dataset = MultiHeadDataset(data_dir, processor, task_config)
+     if len(dataset) == 0:
+         logger.error("No training data found!")
+         return
+
+     # Split dataset (simple train/val split)
+     train_size = int(0.8 * len(dataset))
+     val_size = len(dataset) - train_size
+     train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
+
+     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+     val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
+
+     # Setup training
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     logger.info(f"Using device: {device}")
+     model.to(device)
+
+     # Optimizer and loss functions
+     # Get model parameters that require gradients (only classification heads)
+     params = []
+     for name, param in model.named_parameters():
+         if param.requires_grad:
+             params.append(param)
+
+     optimizer = optim.AdamW(params, lr=1e-2)
+
+     # Linear cooldown LR scheduler
+     def linear_cooldown(epoch):
+         return max(0.1, 1.0 - (epoch / epochs))
+
+     scheduler = LambdaLR(optimizer, lr_lambda=linear_cooldown)
+     criterion = nn.CrossEntropyLoss()
+
+     # Initialize training history
+     history = {
+         'train_loss': [],
+         'val_loss': [],
+         'learning_rates': [],
+         'val_accuracy': {task['key']: [] for task in task_config['tasks']},
+         'epoch_val_accuracy': []
+     }
+
+     # Training loop
+     for epoch in range(epochs):
+         # Training phase
+         model.train()
+         total_train_loss = 0
+
+         for batch_idx, batch in enumerate(train_loader):
+             optimizer.zero_grad()
+
+             # Move to device
+             pixel_values = batch['pixel_values'].to(device)
+
+             # Forward pass
+             outputs = model(pixel_values)
+
+             # Calculate losses for each task
+             losses = []
+             for task in task_config['tasks']:
+                 task_key = task['key']
+                 labels = batch['labels'][task_key].to(device)
+                 loss = criterion(outputs[task_key], labels)
+                 losses.append(loss)
+
+             # Total loss
+             total_batch_loss = sum(losses)
+             total_batch_loss.backward()
+             optimizer.step()
+
+             total_train_loss += total_batch_loss.item()
+
+             if batch_idx % 10 == 0:
+                 logger.info(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx}/{len(train_loader)}, Loss: {total_batch_loss.item():.4f}")
+
+         avg_train_loss = total_train_loss / len(train_loader)
+         history['train_loss'].append(avg_train_loss)
+
+         # Record learning rate
+         current_lr = optimizer.param_groups[0]['lr']
+         history['learning_rates'].append(current_lr)
+
+         # Validation phase
+         model.eval()
+         total_val_loss = 0
+         val_accuracies = {task['key']: [] for task in task_config['tasks']}
+
+         with torch.no_grad():
+             for batch in val_loader:
+                 pixel_values = batch['pixel_values'].to(device)
+
+                 outputs = model(pixel_values)
+
+                 # Calculate validation losses and accuracies
+                 losses = []
+                 for task in task_config['tasks']:
+                     task_key = task['key']
+                     labels = batch['labels'][task_key].to(device)
+                     loss = criterion(outputs[task_key], labels)
+                     losses.append(loss)
+
+                     # Calculate accuracy
+                     acc = calculate_accuracy(outputs[task_key], labels)
+                     val_accuracies[task_key].append(acc)
+
+                 total_val_loss += sum(losses).item()
+
+         avg_val_loss = total_val_loss / len(val_loader)
+         history['val_loss'].append(avg_val_loss)
+
+         # Calculate average accuracies
+         epoch_accuracies = {}
+         for task in task_config['tasks']:
+             task_key = task['key']
+             avg_acc = np.mean(val_accuracies[task_key])
+             epoch_accuracies[task_key] = avg_acc
+             history['val_accuracy'][task_key].append(avg_acc)
+
+         history['epoch_val_accuracy'].append(epoch_accuracies.copy())
+
+         logger.info(f"Epoch {epoch+1}/{epochs}")
+         logger.info(f"  Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
+         logger.info(f"  Learning Rate: {current_lr:.6f}")
+         logger.info(f"  Val Accuracies: {epoch_accuracies}")
+
+         # Step the learning rate scheduler
+         scheduler.step()
+
+     # Create comprehensive checkpoint
+     checkpoint = {
+         'epoch': epochs,
+         'model_state_dict': model.state_dict(),
+         'optimizer_state_dict': optimizer.state_dict(),
+         'scheduler_state_dict': scheduler.state_dict(),
+         'history': history,
+         'final_accuracies': epoch_accuracies,
+         'task_config': task_config
+     }
+
+     # Save the trained model and checkpoint
+     torch.save(model.state_dict(), checkpoint_dir / 'multi_head_siglip2_classifier.pth')
+     torch.save(checkpoint, checkpoint_dir / 'training_checkpoint.pth')
+     logger.info(f"Model saved to {checkpoint_dir / 'multi_head_siglip2_classifier.pth'}")
+     logger.info(f"Full checkpoint saved to {checkpoint_dir / 'training_checkpoint.pth'}")
+
+     # Save processor for inference
+     processor.save_pretrained(checkpoint_dir / 'siglip2_processor')
+     logger.info(f"Processor saved to {checkpoint_dir / 'siglip2_processor'}")
+
+     # Save training history as JSON
+     with open(checkpoint_dir / 'training_history.json', 'w') as f:
+         json_history = {}
+         for key, value in history.items():
+             if key == 'val_accuracy':
+                 json_history[key] = {task: [float(acc) for acc in accs] for task, accs in value.items()}
+             elif key == 'epoch_val_accuracy':
+                 json_history[key] = [{task: float(acc) for task, acc in epoch.items()} for epoch in value]
+             else:
+                 json_history[key] = [float(x) for x in value]
+         json.dump(json_history, f, indent=2)
+     logger.info(f"Training history saved to {checkpoint_dir / 'training_history.json'}")
+
+     # Generate and save validation accuracy plots
+     best_accs, final_accs = plot_validation_accuracies(history, task_config, checkpoint_dir / 'validation_accuracies.png')
+
+     # Save detailed validation accuracy summary
+     val_summary = {
+         'best_accuracies': {
+             task['key']: float(max(history['val_accuracy'][task['key']]))
+             for task in task_config['tasks']
+         },
+         'final_accuracies': {task: float(acc) for task, acc in epoch_accuracies.items()},
+         'average_best_accuracy': float(np.mean(best_accs)),
+         'average_final_accuracy': float(np.mean(final_accs)),
+         'improvement_per_task': {
+             task['key']: float(history['val_accuracy'][task['key']][-1] - history['val_accuracy'][task['key']][0])
+             for task in task_config['tasks']
+         }
+     }
+
+     with open(checkpoint_dir / 'validation_summary.json', 'w') as f:
+         json.dump(val_summary, f, indent=2)
+     logger.info(f"Validation summary saved to {checkpoint_dir / 'validation_summary.json'}")
+
+     # Save final training summary
+     final_summary = {
+         "model_type": "SigLIP2 Multi-Head Classifier",
+         "training_samples": len(train_dataset),
+         "validation_samples": len(val_dataset),
+         "epochs": epochs,
+         "final_train_loss": avg_train_loss,
+         "final_val_loss": avg_val_loss,
+         "final_accuracies": epoch_accuracies,
+         "task_config": task_config,
+         "classification_heads": {
+             task['key']: f"{task['type']} - {task['description']}"
+             for task in task_config['tasks']
+         }
+     }
+
+     with open(checkpoint_dir / 'stage4_summary.json', 'w') as f:
+         json.dump(final_summary, f, indent=2)
+     logger.info(f"Stage 4 summary saved to {checkpoint_dir / 'stage4_summary.json'}")
+
+     # Log summary of saved artifacts
+     logger.info("="*60)
+     logger.info("TRAINING COMPLETE - ARTIFACTS SAVED:")
+     logger.info(f"📁 Checkpoint Directory: {checkpoint_dir}")
+     logger.info(f"🤖 Model Weights: multi_head_siglip2_classifier.pth")
+     logger.info(f"💾 Full Checkpoint: training_checkpoint.pth")
+     logger.info(f"🔧 Processor: siglip2_processor/")
+     logger.info(f"⚙️ Task Config: task_config.json")
+     logger.info(f"📊 Training History: training_history.json")
+     logger.info(f"📈 Validation Plots: validation_accuracies.png")
+     logger.info(f"📋 Validation Summary: validation_summary.json")
+     logger.info(f"📄 Stage Summary: stage4_summary.json")
+     logger.info("="*60)
+
+ def main():
+     """Main execution for Stage 4"""
+     logger.info("Starting Stage 4: SigLIP v2 Multi-Head Training...")
+
+     # Train classifier
+     train_multi_head_classifier('./data', epochs=10, batch_size=2)
+
+     logger.info("Stage 4 completed successfully!")
+     logger.info("🎉 Complete pipeline finished! Check ./checkpoints/ for all training artifacts.")
+
+ if __name__ == "__main__":
+     main()
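
As a quick sanity check after training, the saved weights can be reloaded with the same class definition; a minimal sketch, assuming stage_4.py is importable from the working directory and the default ./checkpoints/ artifacts produced above:

    # Minimal reload sketch (assumes the artifacts written by stage_4.py)
    import json
    import torch
    from transformers import AutoProcessor
    from stage_4 import MultiHeadSiglipClassifier, CKPT

    with open("./checkpoints/task_config.json") as f:
        task_config = json.load(f)

    model = MultiHeadSiglipClassifier(task_config, model_name=CKPT)
    state_dict = torch.load("./checkpoints/multi_head_siglip2_classifier.pth", map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()

    # The processor was saved alongside the weights and can be reloaded from disk
    processor = AutoProcessor.from_pretrained("./checkpoints/siglip2_processor")
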
streamlit_evaluation_app.py ADDED
@@ -0,0 +1,695 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Streamlit Data Viewer and Model Evaluation System
4
+ Interactive dashboard for exploring validation results with threshold filtering
5
+ """
6
+
7
+ import streamlit as st
8
+ import json
9
+ import pandas as pd
10
+ import numpy as np
11
+ from PIL import Image
12
+ import plotly.express as px
13
+ import plotly.graph_objects as go
14
+ from plotly.subplots import make_subplots
15
+ import os
16
+ from pathlib import Path
17
+ import subprocess
18
+ import sys
19
+ from rapidocr import RapidOCR
20
+ from matplotlib import pyplot as plt
21
+
22
+ # Page config
23
+ st.set_page_config(
24
+ page_title="Pseudoable Classifier Evaluation Dashboard",
25
+ page_icon="🔍",
26
+ layout="wide",
27
+ initial_sidebar_state="expanded"
28
+ )
29
+
30
+ # Custom CSS for better styling
31
+ st.markdown("""
32
+ <style>
33
+ .main-header {
34
+ font-size: 3rem;
35
+ font-weight: bold;
36
+ color: #1f77b4;
37
+ text-align: center;
38
+ margin-bottom: 2rem;
39
+ padding: 1rem;
40
+ background: linear-gradient(90deg, #f0f8ff, #e6f3ff);
41
+ border-radius: 10px;
42
+ }
43
+ .metric-card {
44
+ background-color: #f8f9fa;
45
+ padding: 1rem;
46
+ border-radius: 8px;
47
+ border-left: 4px solid #1f77b4;
48
+ margin: 0.5rem 0;
49
+ }
50
+ .filter-section {
51
+ background-color: #ffffff;
52
+ padding: 1.5rem;
53
+ border-radius: 10px;
54
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
55
+ margin-bottom: 2rem;
56
+ }
57
+ .image-container {
58
+ border: 2px solid #e6e6e6;
59
+ border-radius: 8px;
60
+ padding: 10px;
61
+ margin: 10px 0;
62
+ background-color: #fafafa;
63
+ }
64
+ .prediction-badge {
65
+ display: inline-block;
66
+ padding: 0.25rem 0.5rem;
67
+ border-radius: 15px;
68
+ font-size: 0.8rem;
69
+ font-weight: bold;
70
+ margin: 0.2rem;
71
+ }
72
+ .correct-prediction {
73
+ background-color: #d4edda;
74
+ color: #155724;
75
+ }
76
+ .incorrect-prediction {
77
+ background-color: #f8d7da;
78
+ color: #721c24;
79
+ }
80
+ </style>
81
+ """, unsafe_allow_html=True)
82
+
83
+ @st.cache_data
84
+ def load_task_config(config_path: str = './task_config.json'):
85
+ """Load task configuration from JSON file"""
86
+ if not os.path.exists(config_path):
87
+ # Try to load from checkpoints directory
88
+ checkpoint_config = './checkpoints/task_config.json'
89
+ if os.path.exists(checkpoint_config):
90
+ config_path = checkpoint_config
91
+ else:
92
+ return None
93
+
94
+ with open(config_path, 'r') as f:
95
+ config = json.load(f)
96
+ return config
97
+
98
+ @st.cache_data
99
+ def load_validation_results(file_path: str = './validation_results.json'):
100
+ """Load validation results from JSON file"""
101
+ if not os.path.exists(file_path):
102
+ return None
103
+
104
+ with open(file_path, 'r') as f:
105
+ data = json.load(f)
106
+ return data
107
+
108
+ @st.cache_resource
109
+ def get_ocr_engine():
110
+ """Initialize and cache OCR engine"""
111
+ return RapidOCR()
112
+
113
+ @st.cache_data
114
+ def extract_text_from_image(image_path: str):
115
+ """Extract text from image using OCR"""
116
+ try:
117
+ engine = get_ocr_engine()
118
+ result = engine(image_path)
119
+
120
+ # Handle new RapidOCR output format
121
+ if result and hasattr(result, 'txts') and result.txts:
122
+ texts = result.txts
123
+ return {
124
+ 'text': ' '.join(texts) if texts else '',
125
+ 'num_text_blocks': len(texts),
126
+ 'has_text': len(texts) > 0
127
+ }
128
+ elif result and isinstance(result, (list, tuple)) and len(result) > 0:
129
+ # Fallback for older format
130
+ texts = []
131
+ for item in result:
132
+ if len(item) >= 2:
133
+ texts.append(item[1])
134
+
135
+ return {
136
+ 'text': ' '.join(texts) if texts else '',
137
+ 'num_text_blocks': len(texts),
138
+ 'has_text': len(texts) > 0
139
+ }
140
+ else:
141
+ return {
142
+ 'text': '',
143
+ 'num_text_blocks': 0,
144
+ 'has_text': False
145
+ }
146
+ except Exception as e:
147
+ return {
148
+ 'text': f'OCR Error: {str(e)}',
149
+ 'num_text_blocks': 0,
150
+ 'has_text': False
151
+ }
152
+
153
+ @st.cache_data
154
+ def process_validation_data(validation_data, task_config):
155
+ """Process validation data into DataFrame for easier filtering"""
156
+ if not validation_data or not task_config:
157
+ return None
158
+
159
+ rows = []
160
+ tasks = {task['key']: task for task in task_config['tasks']}
161
+
162
+ for result in validation_data['results']:
163
+ row = {
164
+ 'idx': result['idx'],
165
+ 'caption': result['caption'],
166
+ 'image_path': result['image_path'],
167
+ 'url': result['url'],
168
+ 'hash': result['hash']
169
+ }
170
+
171
+ # Ground truth and predictions
172
+ gt = result['ground_truth']
173
+ pred = result['predictions']
174
+
175
+ # Process each task dynamically
176
+ for task_key, task_info in tasks.items():
177
+ # Ground truth
178
+ row[f'gt_{task_key}'] = gt[task_key]
179
+
180
+ # Predictions
181
+ row[f'pred_{task_key}'] = pred[f'{task_key}_prediction']
182
+ row[f'conf_{task_key}'] = pred[f'{task_key}_confidence']
183
+
184
+ # For binary tasks, also include probability for 'yes'
185
+ if task_info['type'] == 'binary':
186
+ row[f'prob_{task_key}_yes'] = pred.get(f'{task_key}_prob_yes', 0.5)
187
+
188
+ # Correctness
189
+ row[f'correct_{task_key}'] = gt[task_key] == pred[f'{task_key}_prediction']
190
+
191
+ rows.append(row)
192
+
193
+ return pd.DataFrame(rows)
194
+
195
+ def run_validation_if_needed():
196
+ """Run validation if results don't exist"""
197
+ if not os.path.exists('./validation_results.json'):
198
+ st.warning("Validation results not found. Running validation...")
199
+
200
+ # Check if model exists
201
+ if not os.path.exists('./checkpoints/multi_head_siglip2_classifier.pth'):
202
+ st.error("❌ Trained model not found! Please run the training pipeline first.")
203
+ st.code("python stage_4.py")
204
+ return False
205
+
206
+ # Run validation
207
+ with st.spinner("Running model on validation set... This may take a few minutes."):
208
+ try:
209
+ result = subprocess.run([sys.executable, 'validation_runner.py'],
210
+ capture_output=True, text=True)
211
+ if result.returncode == 0:
212
+ st.success("✅ Validation completed successfully!")
213
+ st.rerun()
214
+ else:
215
+ st.error(f"❌ Validation failed: {result.stderr}")
216
+ return False
217
+ except Exception as e:
218
+ st.error(f"❌ Error running validation: {e}")
219
+ return False
220
+
221
+ return True
222
+
223
+ def create_overview_metrics(df, validation_data, task_config):
224
+ """Create overview metrics section"""
225
+ st.markdown("## 📊 Overview Metrics")
226
+
227
+ tasks = [task['key'] for task in task_config['tasks']]
228
+
229
+ # Basic stats
230
+ col1, col2, col3, col4 = st.columns(4)
231
+
232
+ with col1:
233
+ st.metric("Total Samples", len(df))
234
+
235
+ with col2:
236
+ avg_confidence = np.mean([df[f'conf_{task}'].mean() for task in tasks])
237
+ st.metric("Avg Confidence", f"{avg_confidence:.3f}")
238
+
239
+ with col3:
240
+ overall_accuracy = np.mean([df[f'correct_{task}'].mean() for task in tasks])
241
+ st.metric("Overall Accuracy", f"{overall_accuracy:.3f}")
242
+
243
+ with col4:
244
+ if validation_data and 'metadata' in validation_data:
245
+ model_accuracies = validation_data['metadata']['validation_accuracies']
246
+ model_avg = np.mean(list(model_accuracies.values()))
247
+ st.metric("Model Accuracy", f"{model_avg:.3f}")
248
+
249
+ # Detailed accuracies
250
+ st.markdown("### 🎯 Accuracy per Classification Task")
251
+
252
+ # Create dynamic columns based on number of tasks
253
+ n_tasks = len(tasks)
254
+ n_cols = min(5, n_tasks) # Max 5 columns
255
+ acc_cols = st.columns(n_cols)
256
+
257
+ for i, task in enumerate(tasks):
258
+ task_info = next(t for t in task_config['tasks'] if t['key'] == task)
259
+ with acc_cols[i % n_cols]:
260
+ accuracy = df[f'correct_{task}'].mean()
261
+ st.metric(task_info['name'], f"{accuracy:.3f}")
262
+
263
+ def create_confidence_distribution_plot(df, task_config):
264
+ """Create confidence distribution plots"""
265
+ tasks = [task['key'] for task in task_config['tasks']]
266
+ task_names = [task['name'] for task in task_config['tasks']]
267
+
268
+ n_tasks = len(tasks)
269
+ n_cols = 3
270
+ n_rows = (n_tasks + n_cols - 1) // n_cols
271
+
272
+ fig = make_subplots(
273
+ rows=n_rows, cols=n_cols,
274
+ subplot_titles=task_names,
275
+ specs=[[{"secondary_y": False} for _ in range(n_cols)] for _ in range(n_rows)]
276
+ )
277
+
278
+ colors = plt.cm.Set1(np.linspace(0, 1, n_tasks))
279
+
280
+ for i, (task_key, color) in enumerate(zip(tasks, colors)):
281
+ row = (i // n_cols) + 1
282
+ col = (i % n_cols) + 1
283
+
284
+ fig.add_trace(
285
+ go.Histogram(
286
+ x=df[f'conf_{task_key}'],
287
+ nbinsx=20,
288
+ name=f'{task_key}',
289
+ marker_color=f'rgba({color[0]*255:.0f},{color[1]*255:.0f},{color[2]*255:.0f},0.7)',
290
+ opacity=0.7
291
+ ),
292
+ row=row, col=col
293
+ )
294
+
295
+ fig.update_layout(
296
+ title="Confidence Score Distributions",
297
+ showlegend=False,
298
+ height=200 * n_rows + 100
299
+ )
300
+
301
+ return fig
302
+
303
+ def apply_filters(df, task_config):
304
+ """Apply user-defined filters to the dataframe"""
305
+ st.markdown("## 🔍 Filter Data")
306
+
307
+ tasks = {task['key']: task for task in task_config['tasks']}
308
+
309
+ # Create filter sidebar
310
+ with st.sidebar:
311
+ st.markdown("### Task Confidence Filters")
312
+
313
+ # Confidence thresholds for each task
314
+ confidence_filters = {}
315
+ for task_key, task_info in tasks.items():
316
+ if task_info['type'] == 'multi_class':
317
+ # Only show confidence threshold for multi-class tasks
318
+ confidence_filters[task_key] = st.slider(
319
+ f"{task_info['name']} Confidence",
320
+ 0.0, 1.0, 0.5, 0.01,
321
+ key=f"conf_{task_key}"
322
+ )
323
+
324
+ st.markdown("### Content Filters")
325
+
326
+ # Category filters (for multi-class tasks)
327
+ category_filters = {}
328
+ for task_key, task_info in tasks.items():
329
+ if task_info['type'] == 'multi_class':
330
+ available_values = df[f'gt_{task_key}'].unique().tolist()
331
+ selected_values = st.multiselect(
332
+ f"Ground Truth {task_info['name']}",
333
+ available_values,
334
+ default=available_values,
335
+ key=f"gt_{task_key}_filter"
336
+ )
337
+ category_filters[task_key] = selected_values
338
+
339
+ # Binary prediction filters
340
+ st.markdown("**Filter by Predictions:**")
341
+ prediction_filters = {}
342
+ for task_key, task_info in tasks.items():
343
+ if task_info['type'] == 'binary':
344
+ filter_value = st.selectbox(
345
+ f"{task_info['name']}:",
346
+ ["All", "Yes only", "No only"],
347
+ key=f"pred_{task_key}_filter"
348
+ )
349
+ prediction_filters[task_key] = filter_value
350
+
351
+ # Correctness filter
352
+ st.markdown("**Filter by Correctness:**")
353
+ correctness_filter = st.selectbox(
354
+ "Show only:",
355
+ ["All predictions", "Correct predictions", "Incorrect predictions"]
356
+ )
357
+
358
+ # OCR filters (if screenshot task exists)
359
+ has_screenshot_task = any(task['key'] == 'is_screenshot_with_text' for task in task_config['tasks'])
360
+ if has_screenshot_task:
361
+ st.markdown("**Filter by Text Content:**")
362
+ ocr_filter = st.selectbox(
363
+ "Text Content:",
364
+ ["All images", "Images with text", "Images without text"],
365
+ key="ocr_filter"
366
+ )
367
+ enable_ocr = st.checkbox("Enable OCR text extraction", value=True)
368
+ else:
369
+ ocr_filter = "All images"
370
+ enable_ocr = False
371
+
372
+ # Apply filters
373
+ filtered_df = df.copy()
374
+
375
+ # Confidence filters
376
+ for task_key, threshold in confidence_filters.items():
377
+ filtered_df = filtered_df[filtered_df[f'conf_{task_key}'] >= threshold]
378
+
379
+ # Category filters
380
+ for task_key, selected_values in category_filters.items():
381
+ filtered_df = filtered_df[filtered_df[f'gt_{task_key}'].isin(selected_values)]
382
+
383
+ # Binary prediction filters
384
+ for task_key, filter_value in prediction_filters.items():
385
+ if filter_value == "Yes only":
386
+ filtered_df = filtered_df[filtered_df[f'pred_{task_key}'] == 'yes']
387
+ elif filter_value == "No only":
388
+ filtered_df = filtered_df[filtered_df[f'pred_{task_key}'] == 'no']
389
+
390
+ # Correctness filter: a sample counts as correct only when every task's prediction matches its ground truth
391
+ if correctness_filter == "Correct predictions":
392
+ correct_mask = True
393
+ for task_key in tasks.keys():
394
+ correct_mask = correct_mask & filtered_df[f'correct_{task_key}']
395
+ filtered_df = filtered_df[correct_mask]
396
+ elif correctness_filter == "Incorrect predictions":
397
+ correct_mask = True
398
+ for task_key in tasks.keys():
399
+ correct_mask = correct_mask & filtered_df[f'correct_{task_key}']
400
+ filtered_df = filtered_df[~correct_mask]
401
+
402
+ # Show filter results
403
+ st.info(f"Filtered to {len(filtered_df)} samples (from {len(df)} total)")
404
+
405
+ return filtered_df, ocr_filter, enable_ocr
406
+
407
+ def display_sample_images(df, task_config, ocr_filter="All images", enable_ocr=True):
408
+ """Display sample images with predictions and ground truth"""
409
+ st.markdown("## 🖼️ Sample Images")
410
+
411
+ if len(df) == 0:
412
+ st.warning("No images match the current filters.")
413
+ return
414
+
415
+ tasks = {task['key']: task for task in task_config['tasks']}
416
+
417
+ # Add controls for image display
418
+ col1, col2, col3 = st.columns([2, 1, 1])
419
+
420
+ with col1:
421
+ max_images = st.slider(
422
+ "Number of images to display",
423
+ min_value=10,
424
+ max_value=min(200, len(df)),
425
+ value=min(50, len(df)),
426
+ step=10
427
+ )
428
+
429
+ with col2:
430
+ sort_by = st.selectbox(
431
+ "Sort by:",
432
+ ["Original order", "Confidence (low to high)", "Confidence (high to low)"]
433
+ )
434
+
435
+ with col3:
436
+ cols_per_row = st.selectbox("Images per row:", [2, 3, 4], index=1)
437
+
438
+ # Sort dataframe if requested
439
+ task_keys = list(tasks.keys())
440
+ if sort_by == "Confidence (low to high)":
441
+ avg_conf = sum(df[f'conf_{task}'] for task in task_keys) / len(task_keys)
442
+ display_df = df.iloc[avg_conf.argsort()].head(max_images)
443
+ elif sort_by == "Confidence (high to low)":
444
+ avg_conf = sum(df[f'conf_{task}'] for task in task_keys) / len(task_keys)
445
+ display_df = df.iloc[avg_conf.argsort()[::-1]].head(max_images)
446
+ else:
447
+ display_df = df.head(max_images)
448
+
449
+ # Apply OCR filtering if needed
450
+ if enable_ocr and ocr_filter != "All images":
451
+ st.info("🔍 Applying OCR filtering... This may take a moment for many images.")
452
+
453
+ ocr_results = []
454
+ progress_bar = st.progress(0)
455
+
456
+ for idx, (_, row) in enumerate(display_df.iterrows()):
457
+ if os.path.exists(row['image_path']):
458
+ ocr_result = extract_text_from_image(row['image_path'])
459
+ ocr_results.append(ocr_result['has_text'])
460
+ else:
461
+ ocr_results.append(False)
462
+
463
+ progress_bar.progress((idx + 1) / len(display_df))
464
+
465
+ progress_bar.empty()
466
+
467
+ # Filter based on OCR results
468
+ if ocr_filter == "Images with text":
469
+ mask = ocr_results
470
+ else: # "Images without text"
471
+ mask = [not has_text for has_text in ocr_results]
472
+
473
+ display_df = display_df[mask].reset_index(drop=True)
474
+ st.success(f"OCR filtering complete. Found {len(display_df)} images matching criteria.")
475
+
476
+ # Display images
477
+ for i in range(0, len(display_df), cols_per_row):
478
+ cols = st.columns(cols_per_row)
479
+
480
+ for j in range(cols_per_row):
481
+ if i + j < len(display_df):
482
+ row = display_df.iloc[i + j]
483
+
484
+ with cols[j]:
485
+ # Load and display image
486
+ try:
487
+ if os.path.exists(row['image_path']):
488
+ img = Image.open(row['image_path'])
489
+ st.image(img, caption=f"Sample {row['idx']}", use_column_width=True)
490
+ else:
491
+ st.error(f"Image not found: {row['image_path']}")
492
+ continue
493
+ except Exception as e:
494
+ st.error(f"Error loading image: {e}")
495
+ continue
496
+
497
+ # Caption
498
+ st.markdown(f"**Caption:** {row['caption'][:100]}...")
499
+
500
+ # OCR Text Extraction
501
+ if enable_ocr and 'is_screenshot_with_text' in tasks:
502
+ with st.expander("🔍 Extracted Text (OCR)", expanded=False):
503
+ ocr_result = extract_text_from_image(row['image_path'])
504
+ if ocr_result['has_text']:
505
+ st.markdown(f"**Text Blocks Found:** {ocr_result['num_text_blocks']}")
506
+ st.text_area(
507
+ "Extracted Text:",
508
+ value=ocr_result['text'],
509
+ height=100,
510
+ key=f"ocr_text_{row['idx']}",
511
+ help="Text extracted from the image using OCR"
512
+ )
513
+
514
+ text_length = len(ocr_result['text'])
515
+ word_count = len(ocr_result['text'].split())
516
+ st.caption(f"📊 Text stats: {text_length} chars, {word_count} words")
517
+
518
+ if row['pred_is_screenshot_with_text'] == 'yes':
519
+ st.success("✅ Screenshot prediction correlates with text presence")
520
+ elif ocr_result['num_text_blocks'] > 5:
521
+ st.warning("⚠️ High text content but not predicted as screenshot")
522
+ else:
523
+ st.info("No text detected in this image")
524
+ if row['pred_is_screenshot_with_text'] == 'yes':
525
+ st.warning("⚠️ Predicted as screenshot but no text found")
526
+
527
+ # Predictions vs Ground Truth
528
+ st.markdown("**Predictions vs Ground Truth:**")
529
+
530
+ # Display all tasks dynamically
531
+ for task_key, task_info in tasks.items():
532
+ pred_val = row[f'pred_{task_key}']
533
+ gt_val = row[f'gt_{task_key}']
534
+ conf_val = row[f'conf_{task_key}']
535
+ correct = pred_val == gt_val
536
+
537
+ badge_class = "correct-prediction" if correct else "incorrect-prediction"
538
+ st.markdown(f"""
539
+ <div class="prediction-badge {badge_class}">
540
+ {task_info['name']}: {pred_val} | GT: {gt_val} | Conf: {conf_val:.3f}
541
+ </div>
542
+ """, unsafe_allow_html=True)
543
+
544
+ st.markdown("---")
545
+
546
+ if len(df) > max_images:
547
+ st.info(f"Showing {max_images} of {len(df)} filtered images. Use the slider above to show more images.")
548
+
549
+ def create_confusion_matrices(df, task_config):
550
+ """Create confusion matrices for each classification task"""
551
+ st.markdown("## 📊 Model Performance Analysis")
552
+
553
+ tasks = {task['key']: task for task in task_config['tasks']}
554
+ binary_tasks = [t for t in tasks.values() if t['type'] == 'binary']
555
+ multi_class_tasks = [t for t in tasks.values() if t['type'] == 'multi_class']
556
+
557
+ tab1, tab2, tab3 = st.tabs(["Confusion Matrices", "Confidence Analysis", "Task Performance"])
558
+
559
+ with tab1:
560
+ # Binary classification confusion matrices
561
+ if binary_tasks:
562
+ st.markdown("### Binary Classification Tasks")
563
+ n_binary = len(binary_tasks)
564
+ n_cols = min(2, n_binary)
565
+
566
+ for i in range(0, n_binary, n_cols):
567
+ cols = st.columns(n_cols)
568
+ for j in range(n_cols):
569
+ if i + j < n_binary:
570
+ task = binary_tasks[i + j]
571
+ task_key = task['key']
572
+
573
+ with cols[j]:
574
+ confusion_data = pd.crosstab(
575
+ df[f'gt_{task_key}'],
576
+ df[f'pred_{task_key}'],
577
+ margins=True
578
+ )
579
+ st.markdown(f"**{task['name']} Confusion Matrix**")
580
+ st.dataframe(confusion_data, use_container_width=True)
581
+
582
+ # Multi-class confusion matrices
583
+ if multi_class_tasks:
584
+ st.markdown("### Multi-class Classification Tasks")
585
+ for task in multi_class_tasks:
586
+ task_key = task['key']
587
+ st.markdown(f"**{task['name']} Confusion Matrix**")
588
+ confusion_data = pd.crosstab(
589
+ df[f'gt_{task_key}'],
590
+ df[f'pred_{task_key}'],
591
+ margins=True
592
+ )
593
+ st.dataframe(confusion_data, use_container_width=True)
594
+
595
+ with tab2:
596
+ # Confidence analysis plots
597
+ fig1 = create_confidence_distribution_plot(df, task_config)
598
+ st.plotly_chart(fig1, use_container_width=True)
599
+
600
+ with tab3:
601
+ # Task-wise performance
602
+ st.markdown("**Performance by Task**")
603
+
604
+ performance_data = []
605
+ for task_key, task_info in tasks.items():
606
+ accuracy = df[f'correct_{task_key}'].mean()
607
+ confidence = df[f'conf_{task_key}'].mean()
608
+
609
+ performance_data.append({
610
+ 'Task': task_info['name'],
611
+ 'Type': task_info['type'],
612
+ 'Accuracy': accuracy,
613
+ 'Avg Confidence': confidence
614
+ })
615
+
616
+ performance_df = pd.DataFrame(performance_data)
617
+ st.dataframe(performance_df, use_container_width=True)
618
+
619
+ # Performance visualization
620
+ fig = px.scatter(performance_df, x='Avg Confidence', y='Accuracy',
621
+ color='Type', text='Task',
622
+ title="Task Performance: Accuracy vs Confidence")
623
+ fig.update_traces(textposition="top center")
624
+ st.plotly_chart(fig, use_container_width=True)
625
+
626
+ def main():
627
+ """Main Streamlit application"""
628
+ # Header
629
+ st.markdown('<div class="main-header">🔍 Pseudoable Classifier Evaluation Dashboard</div>',
630
+ unsafe_allow_html=True)
631
+
632
+ # Load task configuration
633
+ task_config = load_task_config()
634
+ if not task_config:
635
+ st.error("❌ Could not load task configuration. Please ensure task_config.json exists.")
636
+ st.info("Expected location: ./task_config.json or ./checkpoints/task_config.json")
637
+ return
638
+
639
+ st.success(f"✅ Loaded task configuration with {len(task_config['tasks'])} tasks")
640
+
641
+ # Display task information
642
+ with st.expander("📋 Task Configuration", expanded=False):
643
+ for task in task_config['tasks']:
644
+ st.markdown(f"**{task['name']}** ({task['type']})")
645
+ st.markdown(f"- *Description:* {task['description']}")
646
+ st.markdown(f"- *Labels:* {', '.join(task['labels'])}")
647
+ st.markdown("---")
648
+
649
+ # Check and run validation if needed
650
+ if not run_validation_if_needed():
651
+ return
652
+
653
+ # Load validation results
654
+ validation_data = load_validation_results()
655
+ if not validation_data:
656
+ st.error("❌ Could not load validation results. Please check if validation_results.json exists.")
657
+ return
658
+
659
+ # Process data
660
+ df = process_validation_data(validation_data, task_config)
661
+ if df is None or len(df) == 0:
662
+ st.error("❌ No validation data found or data processing failed.")
663
+ return
664
+
665
+ # Show basic info
666
+ st.success(f"✅ Loaded {len(df)} validation samples successfully!")
667
+
668
+ # Overview metrics
669
+ create_overview_metrics(df, validation_data, task_config)
670
+
671
+ # Apply filters
672
+ filtered_df, ocr_filter, enable_ocr = apply_filters(df, task_config)
673
+
674
+ # Display results
675
+ if len(filtered_df) > 0:
676
+ # Performance analysis
677
+ create_confusion_matrices(filtered_df, task_config)
678
+
679
+ # Sample images
680
+ display_sample_images(filtered_df, task_config, ocr_filter, enable_ocr)
681
+ else:
682
+ st.warning("⚠️ No samples match the current filter criteria. Please adjust your filters.")
683
+
684
+ # Footer
685
+ st.markdown("---")
686
+ st.markdown("**📝 Instructions:**")
687
+ st.markdown("1. Use the sidebar to filter by task confidence and prediction classes")
688
+ st.markdown("2. Filter images by text content using OCR (if screenshot detection task is configured)")
689
+ st.markdown("3. Adjust the number of images to display and sorting order")
690
+ st.markdown("4. View model performance metrics and confusion matrices")
691
+ st.markdown("5. Browse sample images with predictions vs ground truth")
692
+ st.markdown("6. Green badges indicate correct predictions, red badges indicate incorrect predictions")
693
+
694
+ if __name__ == "__main__":
695
+ main()
upload_to_hf.py ADDED
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Upload the trained multi-head SigLIP2 classifier to the Hugging Face Hub (private).
4
+
5
+ This script will create/update the repo `fal/multihead_cls` and push:
6
+ - model weights: checkpoints/multi_head_siglip2_classifier.pth
7
+ - full training checkpoint: checkpoints/training_checkpoint.pth (optional)
8
+ - processor folder: checkpoints/siglip2_processor/
9
+ - README.md with usage
10
+
11
+ Auth: Set HUGGINGFACE_TOKEN environment variable or run `huggingface-cli login`.
12
+ """
13
+
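+ # Typical invocation (assumes the training artifacts listed above already exist under ./checkpoints):
+ #   HUGGINGFACE_TOKEN=hf_xxx python upload_to_hf.py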
14
+ import os
15
+ import shutil
16
+ from pathlib import Path
17
+ from typing import Optional
18
+
19
+ from huggingface_hub import HfApi, HfFolder, create_repo, upload_folder, upload_file
20
+
21
+
22
+ REPO_ID = "fal/multihead_cls"
23
+
24
+
25
+ def ensure_logged_in() -> Optional[str]:
26
+ token = os.getenv("HUGGINGFACE_TOKEN") or HfFolder.get_token()  # falls back to the token cached by `huggingface-cli login`
27
+ if not token:
28
+ raise RuntimeError(
29
+ "No Hugging Face token found. Set HUGGINGFACE_TOKEN or run `huggingface-cli login`."
30
+ )
31
+ return token
32
+
33
+
34
+ def prepare_staging_dir() -> Path:
35
+ root = Path(__file__).parent
36
+ ckpt_dir = root / "checkpoints"
37
+ if not ckpt_dir.exists():
38
+ raise FileNotFoundError("checkpoints/ directory not found. Train the model first.")
39
+
40
+ required = [
41
+ ckpt_dir / "multi_head_siglip2_classifier.pth",
42
+ ckpt_dir / "siglip2_processor",
43
+ ]
44
+ for path in required:
45
+ if not path.exists():
46
+ raise FileNotFoundError(f"Missing required artifact: {path}")
47
+
48
+ # Check for task_config.json in checkpoints or root directory
49
+ task_config_path = ckpt_dir / "task_config.json"
50
+ if not task_config_path.exists():
51
+ task_config_path = root / "task_config.json"
52
+ if not task_config_path.exists():
53
+ raise FileNotFoundError("Missing required artifact: task_config.json (checked both checkpoints/ and root directory)")
54
+
55
+ staging = root / "hf_export"
56
+ if staging.exists():
57
+ shutil.rmtree(staging)
58
+ staging.mkdir(parents=True)
59
+
60
+ # Copy artifacts
61
+ shutil.copy2(ckpt_dir / "multi_head_siglip2_classifier.pth", staging / "model.pth")
62
+ shutil.copy2(task_config_path, staging / "task_config.json")
63
+
64
+ # Optional: training checkpoint and other metadata
65
+ train_ckpt = ckpt_dir / "training_checkpoint.pth"
66
+ if train_ckpt.exists():
67
+ shutil.copy2(train_ckpt, staging / "training_checkpoint.pth")
68
+
69
+ # Optional: training history and validation summary
70
+ for optional_file in ["training_history.json", "validation_summary.json", "stage4_summary.json"]:
71
+ optional_path = ckpt_dir / optional_file
72
+ if optional_path.exists():
73
+ shutil.copy2(optional_path, staging / optional_file)
74
+
75
+ # Processor
76
+ shutil.copytree(ckpt_dir / "siglip2_processor", staging / "processor")
77
+
78
+ # Add example and README if present
79
+ readme_src = root / "README.md"
80
+ if readme_src.exists():
81
+ shutil.copy2(readme_src, staging / "README.md")
82
+ example_src = root / "example.py"
83
+ if example_src.exists():
84
+ shutil.copy2(example_src, staging / "example.py")
85
+
86
+ return staging
87
+
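+ # Resulting hf_export/ layout (when every optional artifact is present): model.pth,
+ # task_config.json, training_checkpoint.pth, training_history.json, validation_summary.json,
+ # stage4_summary.json, processor/, README.md and example.py.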
88
+
89
+ def upload_to_hub(private: bool = True) -> None:
90
+ token = ensure_logged_in()
91
+ api = HfApi(token=token)
92
+
93
+ create_repo(REPO_ID, private=private, repo_type="model", exist_ok=True, token=token)
94
+
95
+ staging = prepare_staging_dir()
96
+
97
+ # Upload all files in staging
98
+ upload_folder(
99
+ folder_path=str(staging),
100
+ repo_id=REPO_ID,
101
+ repo_type="model",
102
+ commit_message="Upload multi-head SigLIP2 classifier with dynamic task configuration",
103
+ token=token,
104
+ )
105
+
106
+ print(f"Uploaded to https://huggingface.co/{REPO_ID} (private={private})")
107
+
108
+
109
+ if __name__ == "__main__":
110
+ upload_to_hub(private=True)
111
+
112
+
validation_runner.py ADDED
@@ -0,0 +1,286 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Validation Runner: Runs the trained model on the validation set and saves the predictions
4
+ """
5
+
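+ # Typical flow: run `python validation_runner.py` after training; results are written to
+ # ./validation_results.json, which streamlit_evaluation_app.py loads (and will trigger this
+ # script automatically via run_validation_if_needed when the file is missing).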
6
+ import os
7
+ import json
8
+ import torch
9
+ import numpy as np
10
+ from PIL import Image
11
+ from pathlib import Path
12
+ import logging
13
+ from transformers import AutoProcessor
14
+ from stage_4 import MultiHeadSiglipClassifier, CKPT, load_task_config
15
+ import pandas as pd
16
+ from tqdm import tqdm
17
+
18
+ # Set up logging
19
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
20
+ logger = logging.getLogger(__name__)
21
+
22
+ def _is_incomplete_classification(classification: dict, task_config: dict) -> bool:
23
+ """Check if classification contains incomplete data (empty or '...' values)"""
24
+ if not task_config or 'tasks' not in task_config:
25
+ return True
26
+
27
+ required_tasks = [task['key'] for task in task_config['tasks']]
28
+
29
+ for task_key in required_tasks:
30
+ if task_key not in classification:
31
+ return True
32
+
33
+ value = classification[task_key]
34
+ # Check for incomplete markers
35
+ if not value or value == "...":  # "not value" already covers "" and None
36
+ return True
37
+
38
+ return False
39
+
40
+ def load_trained_model(checkpoint_dir: str = './checkpoints'):
41
+ """Load the trained model and processor"""
42
+ checkpoint_path = Path(checkpoint_dir)
43
+
44
+ # Load task configuration
45
+ task_config_path = checkpoint_path / 'task_config.json'
46
+ if not task_config_path.exists():
47
+ # Fallback to root directory
48
+ task_config_path = './task_config.json'
49
+
50
+ task_config = load_task_config(str(task_config_path))
51
+
52
+ # Load processor
53
+ processor = AutoProcessor.from_pretrained(CKPT)
54
+
55
+ # Load model with task config
56
+ model = MultiHeadSiglipClassifier(task_config)
57
+ model_state = torch.load(checkpoint_path / 'multi_head_siglip2_classifier.pth', map_location='cpu')
58
+ model.load_state_dict(model_state)
59
+
60
+ # Set to evaluation mode
61
+ model.eval()
62
+
63
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
64
+ model.to(device)
65
+
66
+ logger.info(f"Model loaded successfully on device: {device}")
67
+ return model, processor, device, task_config
68
+
69
+ def load_validation_data(data_dir: str = './data', task_config: dict = None):
70
+ """Load validation samples from stage 2 metadata files"""
71
+ data_path = Path(data_dir)
72
+
73
+ # Load from stage 2 metadata files
74
+ metadata_dir = data_path / 'metadata'
75
+ if not metadata_dir.exists():
76
+ logger.error("Metadata directory not found. Run stages 1 and 2 first.")
77
+ return []
78
+
79
+ metadata_files = list(metadata_dir.glob('meta_*_stage2.json'))
80
+ if not metadata_files:
81
+ logger.error("No stage 2 metadata files found. Run stage 2 first.")
82
+ return []
83
+
84
+ samples = []
85
+ skipped_incomplete = 0
86
+
87
+ for meta_file in tqdm(metadata_files, desc="Loading validation data"):
88
+ try:
89
+ with open(meta_file, 'r') as f:
90
+ metadata = json.load(f)
91
+
92
+ # Check if classification is complete
93
+ if not metadata.get('stage2_complete', False):
94
+ logger.warning(f"Skipping {meta_file} - classification not complete")
95
+ skipped_incomplete += 1
96
+ continue
97
+
98
+ # Check if classification contains incomplete data
99
+ classification = metadata.get('classification', {})
100
+ if not classification or _is_incomplete_classification(classification, task_config):
101
+ logger.warning(f"Skipping {meta_file} - incomplete classification data")
102
+ skipped_incomplete += 1
103
+ continue
104
+
105
+ # Check if image exists
106
+ image_path = metadata['image_path']
107
+ if not os.path.exists(image_path):
108
+ logger.warning(f"Image not found: {image_path}")
109
+ skipped_incomplete += 1
110
+ continue
111
+
112
+ samples.append({
113
+ 'idx': metadata['idx'],
114
+ 'image_path': metadata['image_path'],
115
+ 'caption': metadata['caption'],
116
+ 'url': metadata['url'],
117
+ 'hash': metadata['hash'],
118
+ 'ground_truth': metadata['classification']
119
+ })
120
+
121
+ except Exception as e:
122
+ logger.warning(f"Error loading {meta_file}: {e}")
123
+ skipped_incomplete += 1
124
+
125
+ if skipped_incomplete > 0:
126
+ logger.warning(f"Skipped {skipped_incomplete} incomplete samples")
127
+ logger.info(f"Loaded {len(samples)} valid validation samples")
128
+ return samples
129
+
130
+ def predict_batch(model, processor, images, device, task_config, batch_size=8):
131
+ """Run predictions on a batch of images"""
132
+ predictions = []
133
+ tasks = {task['key']: task for task in task_config['tasks']}
134
+
135
+ for i in range(0, len(images), batch_size):
136
+ batch_images = images[i:i+batch_size]
137
+
138
+ # Process images
139
+ inputs = processor(images=batch_images, return_tensors="pt")
140
+ pixel_values = inputs['pixel_values'].to(device)
141
+
142
+ with torch.no_grad():
143
+ outputs = model(pixel_values)
144
+
145
+ # Convert outputs to probabilities and predictions
146
+ batch_preds = []
147
+ for j in range(len(batch_images)):
148
+ pred = {}
149
+
150
+ # Process each task dynamically
151
+ for task_key, task_info in tasks.items():
152
+ logits = outputs[task_key][j]
153
+ probs = torch.softmax(logits, dim=0)
154
+ pred_class = torch.argmax(logits).item()
155
+ confidence = probs[pred_class].item()
156
+
157
+ if task_info['type'] == 'binary':
158
+ # Binary classification
159
+ pred[f'{task_key}_prediction'] = 'yes' if pred_class == 1 else 'no'
160
+ pred[f'{task_key}_confidence'] = confidence
161
+ pred[f'{task_key}_prob_yes'] = probs[1].item()
162
+ pred[f'{task_key}_prob_no'] = probs[0].item()
163
+
164
+ elif task_info['type'] == 'multi_class':
165
+ # Multi-class classification
166
+ pred_label = task_info['labels'][pred_class]
167
+ pred[f'{task_key}_prediction'] = pred_label
168
+ pred[f'{task_key}_confidence'] = confidence
169
+
170
+ # Add probabilities for all classes
171
+ for idx, label in enumerate(task_info['labels']):
172
+ pred[f'{task_key}_prob_{label}'] = probs[idx].item()
173
+
174
+ batch_preds.append(pred)
175
+
176
+ predictions.extend(batch_preds)
177
+
178
+ return predictions
179
+
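+ # Illustrative shape of one prediction dict, for a hypothetical binary task named "is_photo":
+ # {"is_photo_prediction": "yes", "is_photo_confidence": 0.91,
+ #  "is_photo_prob_yes": 0.91, "is_photo_prob_no": 0.09};
+ # multi-class tasks instead emit one "<task_key>_prob_<label>" entry per label.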
180
+ def calculate_accuracies(predictions, ground_truths, task_config):
181
+ """Calculate accuracies for each task"""
182
+ accuracies = {}
183
+ tasks = {task['key']: task for task in task_config['tasks']}
184
+
185
+ for task_key, task_info in tasks.items():
186
+ pred_key = f'{task_key}_prediction'
187
+
188
+ correct = sum(1 for pred, gt in zip(predictions, ground_truths)
189
+ if pred[pred_key] == gt[task_key])
190
+ total = len(predictions)
191
+ accuracies[f'{task_key}_accuracy'] = correct / total if total > 0 else 0
192
+
193
+ return accuracies
194
+
195
+ def run_validation(data_dir: str = './data', checkpoint_dir: str = './checkpoints',
196
+ output_file: str = './validation_results.json'):
197
+ """Run complete validation and save results"""
198
+ logger.info("Starting validation run...")
199
+
200
+ # Load model and data
201
+ model, processor, device, task_config = load_trained_model(checkpoint_dir)
202
+ samples = load_validation_data(data_dir, task_config)
203
+
204
+ if not samples:
205
+ logger.error("No validation samples found!")
206
+ return
207
+
208
+ # Prepare images for batch processing
209
+ images = []
210
+ for sample in tqdm(samples, desc="Loading images"):
211
+ try:
212
+ img = Image.open(sample['image_path']).convert('RGB')
213
+ images.append(img)
214
+ except Exception as e:
215
+ logger.error(f"Error loading image {sample['image_path']}: {e}")
216
+ images.append(None)
217
+
218
+ # Filter out failed images
219
+ valid_samples = []
220
+ valid_images = []
221
+ for sample, img in zip(samples, images):
222
+ if img is not None:
223
+ valid_samples.append(sample)
224
+ valid_images.append(img)
225
+
226
+ logger.info(f"Running predictions on {len(valid_samples)} valid samples...")
227
+
228
+ # Run predictions
229
+ predictions = predict_batch(model, processor, valid_images, device, task_config)
230
+
231
+ # Calculate accuracies
232
+ ground_truths = [sample['ground_truth'] for sample in valid_samples]
233
+ accuracies = calculate_accuracies(predictions, ground_truths, task_config)
234
+
235
+ # Combine results
236
+ validation_results = []
237
+ for sample, prediction in zip(valid_samples, predictions):
238
+ result = {
239
+ **sample,
240
+ 'predictions': prediction
241
+ }
242
+ validation_results.append(result)
243
+
244
+ # Create final output
245
+ output_data = {
246
+ 'metadata': {
247
+ 'total_samples': len(validation_results),
248
+ 'model_checkpoint': checkpoint_dir,
249
+ 'validation_accuracies': accuracies,
250
+ 'task_config': task_config,
251
+ 'timestamp': pd.Timestamp.now().isoformat()
252
+ },
253
+ 'results': validation_results
254
+ }
255
+
256
+ # Save results
257
+ output_path = Path(output_file)
258
+ with open(output_path, 'w') as f:
259
+ json.dump(output_data, f, indent=2)
260
+
261
+ logger.info(f"Validation results saved to {output_path}")
262
+ logger.info("Validation Accuracies:")
263
+ for key, value in accuracies.items():
264
+ logger.info(f" {key}: {value:.4f}")
265
+
266
+ return output_data
267
+
268
+ def main():
269
+ """Main execution"""
270
+ logger.info("Starting validation runner...")
271
+
272
+ # Check if model exists
273
+ if not Path('./checkpoints/multi_head_siglip2_classifier.pth').exists():
274
+ logger.error("Trained model not found! Run stage 4 first.")
275
+ return
276
+
277
+ # Run validation
278
+ results = run_validation()
279
+
280
+ if results:
281
+ logger.info("Validation completed successfully!")
282
+ else:
283
+ logger.error("Validation failed!")
284
+
285
+ if __name__ == "__main__":
286
+ main()