Spaces:

VOIDER
/

image-evaluation-tool

Sleeping

App Files Files Community

VOIDER committed on Jul 19

Commit

fef69d7

verified ·

1 Parent(s): 51aec78

Delete utils/scoring.py

Browse files

Files changed (1) hide show

utils/scoring.py +0 -359

utils/scoring.py DELETED Viewed

@@ -1,359 +0,0 @@
-import numpy as np
-import logging
-logger = logging.getLogger(__name__)
def calculate_final_score(
    quality_score: float,
    aesthetics_score: float,
    prompt_score: float,
    ai_detection_score: float,
    has_prompt: bool = True
) -> float:
    """
    Calculate the weighted composite score for an image evaluation.

    Every component is clamped to its documented range first. The AI
    detection probability is inverted and rescaled to 0-10 so that it sits
    on the same scale as the other components; without the rescale its 15%
    weight could contribute at most 0.15 points and the composite could
    never exceed 8.65.

    Args:
        quality_score: Technical image quality (0-10)
        aesthetics_score: Visual appeal score (0-10)
        prompt_score: Prompt adherence score (0-10); ignored when
            has_prompt is False
        ai_detection_score: AI generation probability (0-1); a higher
            probability lowers the final score
        has_prompt: Whether prompt metadata is available

    Returns:
        Final composite score (0-10), or the neutral 5.0 if the
        calculation fails (e.g. a non-numeric input).
    """
    try:
        # Clamp every input into its documented range
        quality_score = max(0.0, min(10.0, quality_score))
        aesthetics_score = max(0.0, min(10.0, aesthetics_score))
        prompt_score = max(0.0, min(10.0, prompt_score))
        ai_detection_score = max(0.0, min(1.0, ai_detection_score))

        # Invert the probability and bring it onto the shared 0-10 scale
        authenticity_score = (1.0 - ai_detection_score) * 10.0

        if has_prompt:
            # Standard weights when prompt is available (sum to 1.0)
            weights = {
                'quality': 0.25,      # 25% - Technical quality
                'aesthetics': 0.35,   # 35% - Visual appeal (highest weight)
                'prompt': 0.25,       # 25% - Prompt following
                'ai_detection': 0.15  # 15% - AI detection (inverted)
            }
            score = (
                quality_score * weights['quality'] +
                aesthetics_score * weights['aesthetics'] +
                prompt_score * weights['prompt'] +
                authenticity_score * weights['ai_detection']
            )
        else:
            # Prompt weight is split evenly between quality and aesthetics
            weights = {
                'quality': 0.375,     # 25% + 12.5% from prompt
                'aesthetics': 0.475,  # 35% + 12.5% from prompt
                'ai_detection': 0.15  # 15% - AI detection (inverted)
            }
            score = (
                quality_score * weights['quality'] +
                aesthetics_score * weights['aesthetics'] +
                authenticity_score * weights['ai_detection']
            )

        # Guard against floating-point drift just outside the valid range
        final_score = max(0.0, min(10.0, score))
        logger.debug(f"Score calculation - Quality: {quality_score:.2f}, "
                    f"Aesthetics: {aesthetics_score:.2f}, Prompt: {prompt_score:.2f}, "
                    f"AI Detection: {ai_detection_score:.3f}, Has Prompt: {has_prompt}, "
                    f"Final: {final_score:.2f}")
        return final_score
    except Exception as e:
        logger.error(f"Error calculating final score: {str(e)}")
        return 5.0  # Default neutral score
def calculate_category_rankings(scores_list: list, category: str) -> list:
    """
    Rank every entry of a batch by one score category.

    Ranks are 1-based and higher scores rank better. Entries with equal
    scores share the same rank, and the next distinct score skips ahead
    by the number of tied entries (e.g. 1, 2, 2, 4).

    Args:
        scores_list: List of score dictionaries
        category: Key to rank by ('quality_score', 'aesthetics_score', etc.)

    Returns:
        List of ranks aligned with scores_list. All ones when the list is
        empty or the first entry lacks the category; 1..n in input order
        if ranking fails.
    """
    try:
        if not scores_list or category not in scores_list[0]:
            return [1] * len(scores_list)

        values = [entry[category] for entry in scores_list]

        # An entry's rank is one more than the number of strictly better scores
        return [
            1 + sum(1 for candidate in values if candidate > value)
            for value in values
        ]
    except Exception as e:
        logger.error(f"Error calculating category rankings: {str(e)}")
        return list(range(1, len(scores_list) + 1))
def normalize_scores(scores: list, target_range: tuple = (0, 10)) -> list:
    """
    Linearly rescale scores so the smallest maps to the bottom of
    target_range and the largest maps to the top.

    Args:
        scores: List of numerical scores
        target_range: Tuple of (min, max) for the output range

    Returns:
        List of rescaled scores. An empty input yields an empty list.
        When all scores are identical, every entry becomes the upper end
        of target_range. The input is returned unchanged if rescaling
        fails.
    """
    try:
        if not scores:
            return []

        lowest = min(scores)
        highest = max(scores)

        # Identical scores have zero span; avoid dividing by it
        if highest == lowest:
            return [target_range[1]] * len(scores)

        floor, ceiling = target_range
        out_width = ceiling - floor
        in_width = highest - lowest

        # Clamp each result so rounding cannot push it outside the range
        return [
            max(floor, min(ceiling, floor + (value - lowest) * out_width / in_width))
            for value in scores
        ]
    except Exception as e:
        logger.error(f"Error normalizing scores: {str(e)}")
        return scores
def calculate_confidence_intervals(scores: list, confidence_level: float = 0.95) -> dict:
    """
    Calculate a t-distribution confidence interval for the mean of scores.

    The reported 'std' is the population standard deviation (ddof=0). The
    interval itself is built from the sample standard deviation (ddof=1),
    which is what a Student-t interval with n - 1 degrees of freedom
    requires. A single score has no spread to estimate, so its interval
    collapses onto the mean instead of producing NaN bounds.

    Args:
        scores: List of numerical scores
        confidence_level: Confidence level (0-1)

    Returns:
        Dictionary with mean, std, lower_bound, upper_bound. All zeros for
        an empty input or if the calculation fails.
    """
    try:
        if not scores:
            return {'mean': 0, 'std': 0, 'lower_bound': 0, 'upper_bound': 0}

        n = len(scores)
        mean_score = float(np.mean(scores))
        std_score = float(np.std(scores))

        # With one observation there are zero degrees of freedom
        if n < 2:
            return {
                'mean': mean_score,
                'std': std_score,
                'lower_bound': mean_score,
                'upper_bound': mean_score
            }

        # Imported lazily so the module loads without scipy installed
        from scipy import stats

        # Two-sided critical value for the requested confidence level
        t_value = stats.t.ppf((1 + confidence_level) / 2, n - 1)
        standard_error = np.std(scores, ddof=1) / np.sqrt(n)
        margin_error = t_value * standard_error

        return {
            'mean': mean_score,
            'std': std_score,
            'lower_bound': float(mean_score - margin_error),
            'upper_bound': float(mean_score + margin_error)
        }
    except Exception as e:
        logger.error(f"Error calculating confidence intervals: {str(e)}")
        return {'mean': 0, 'std': 0, 'lower_bound': 0, 'upper_bound': 0}
def detect_outliers(scores: list, method: str = 'iqr') -> list:
    """
    Flag outliers in a list of scores.

    Supported methods:
        'iqr': outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]
        'zscore': absolute z-score (population std) greater than 2.5
        'modified_zscore': 0.6745 * |x - median| / MAD greater than 3.5

    Args:
        scores: List of numerical scores
        method: Method to use ('iqr', 'zscore', 'modified_zscore')

    Returns:
        List of booleans aligned with scores; True marks an outlier.
        All False when there are fewer than three scores, when the method
        is not recognised, or if detection fails.
    """
    try:
        # Too few points to say anything meaningful about spread
        if not scores or len(scores) < 3:
            return [False] * len(scores)

        scores_array = np.array(scores)

        if method == 'iqr':
            # Interquartile Range method
            q1 = np.percentile(scores_array, 25)
            q3 = np.percentile(scores_array, 75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            outliers = (scores_array < lower_bound) | (scores_array > upper_bound)
        elif method == 'zscore':
            # Z-score method, computed with numpy so no extra import is needed
            spread = np.std(scores_array)
            if spread == 0:
                # Identical scores: nothing deviates from the mean
                outliers = np.zeros(scores_array.shape, dtype=bool)
            else:
                z_scores = np.abs((scores_array - np.mean(scores_array)) / spread)
                outliers = z_scores > 2.5
        elif method == 'modified_zscore':
            # Modified Z-score method (more robust)
            median = np.median(scores_array)
            deviations = np.abs(scores_array - median)
            mad = np.median(deviations)
            if mad == 0:
                # Dividing by a zero MAD would make every non-median value
                # infinite; flag those directly and skip the division
                outliers = deviations > 0
            else:
                outliers = 0.6745 * deviations / mad > 3.5
        else:
            logger.warning(f"Unknown outlier detection method: {method}")
            return [False] * len(scores)

        return outliers.tolist()
    except Exception as e:
        logger.error(f"Error detecting outliers: {str(e)}")
        return [False] * len(scores)
def calculate_score_distribution(scores: list) -> dict:
    """
    Calculate descriptive statistics for a list of scores.

    Args:
        scores: List of numerical scores

    Returns:
        Dictionary with count, mean, median, std (population), min, max,
        q1, q3, skewness and kurtosis (excess, as returned by
        scipy.stats.kurtosis). Empty dictionary for an empty input or if
        the calculation fails.
    """
    try:
        if not scores:
            return {}

        # scipy is not imported at module level; without this import the
        # skewness/kurtosis lines raise NameError and the result is always {}
        from scipy import stats

        scores_array = np.array(scores)
        distribution = {
            'count': len(scores),
            'mean': float(np.mean(scores_array)),
            'median': float(np.median(scores_array)),
            'std': float(np.std(scores_array)),
            'min': float(np.min(scores_array)),
            'max': float(np.max(scores_array)),
            'q1': float(np.percentile(scores_array, 25)),
            'q3': float(np.percentile(scores_array, 75)),
            'skewness': float(stats.skew(scores_array)),
            'kurtosis': float(stats.kurtosis(scores_array))
        }
        return distribution
    except Exception as e:
        logger.error(f"Error calculating score distribution: {str(e)}")
        return {}
def apply_score_adjustments(
    scores: dict,
    adjustments: dict = None
) -> dict:
    """
    Return a copy of scores with optional bonuses and penalties applied.

    Recognised adjustment keys:
        anime_mode: when truthy, multiply aesthetics_score by 1.1
            (capped at 10.0)
        penalize_low_resolution: defaults to True; together with width
            and height (default 1024 each), multiplies quality_score by
            0.8 when width * height is below 512x512 pixels
        prompt_length: above 100 multiplies prompt_score by 0.95; from 1
            to 9 multiplies it by 1.05 (capped at 10.0)

    Args:
        scores: Dictionary of scores
        adjustments: Dictionary of adjustment parameters

    Returns:
        Dictionary of adjusted scores; the input dictionary itself is
        returned if an adjustment fails.
    """
    try:
        options = {} if adjustments is None else adjustments
        result = scores.copy()

        # Anime images get a capped aesthetics boost
        if options.get('anime_mode', False) and 'aesthetics_score' in result:
            result['aesthetics_score'] *= 1.1
            result['aesthetics_score'] = min(10.0, result['aesthetics_score'])

        # Images smaller than 512x512 lose a fifth of their quality score
        if options.get('penalize_low_resolution', True):
            pixel_count = options.get('width', 1024) * options.get('height', 1024)
            if pixel_count < 262144 and 'quality_score' in result:
                result['quality_score'] *= 0.8

        # Prompt complexity: long prompts are harder to follow, short ones easier
        prompt_length = options.get('prompt_length', 0)
        if prompt_length > 0 and 'prompt_score' in result:
            if prompt_length > 100:
                result['prompt_score'] *= 0.95
            elif prompt_length < 10:
                result['prompt_score'] *= 1.05
                result['prompt_score'] = min(10.0, result['prompt_score'])

        return result
    except Exception as e:
        logger.error(f"Error applying score adjustments: {str(e)}")
        return scores
def generate_score_summary(results_list: list) -> dict:
    """
    Generate summary statistics for a batch of evaluation results.

    A category is summarised from every result that carries it, so a
    category missing from the first result (e.g. prompt_score on an image
    without a prompt) is still reported when later results include it.

    Args:
        results_list: List of result dictionaries

    Returns:
        Dictionary mapping each score category found to the statistics
        produced by calculate_score_distribution, plus an 'overall' entry
        (total_images, average_score, best_score, worst_score,
        score_range, images_with_prompts) when any result has a
        final_score. Empty dictionary for an empty input or if the
        summary fails.
    """
    try:
        if not results_list:
            return {}

        categories = ['quality_score', 'aesthetics_score', 'prompt_score', 'ai_detection_score', 'final_score']
        summary = {}

        for category in categories:
            # Collect from every result rather than gating on the first one
            scores = [result[category] for result in results_list if category in result]
            if scores:
                summary[category] = calculate_score_distribution(scores)

        # Calculate overall statistics
        final_scores = [result['final_score'] for result in results_list if 'final_score' in result]
        if final_scores:
            best_score = max(final_scores)
            worst_score = min(final_scores)
            summary['overall'] = {
                'total_images': len(results_list),
                # Plain float, consistent with the per-category statistics
                'average_score': float(np.mean(final_scores)),
                'best_score': best_score,
                'worst_score': worst_score,
                'score_range': best_score - worst_score,
                'images_with_prompts': sum(1 for r in results_list if r.get('has_prompt', False))
            }

        return summary
    except Exception as e:
        logger.error(f"Error generating score summary: {str(e)}")
        return {}