VOIDER committed on
Commit fef69d7 · verified · 1 Parent(s): 51aec78

Delete utils/scoring.py

Files changed (1)
  1. utils/scoring.py +0 -359
utils/scoring.py DELETED
@@ -1,359 +0,0 @@
- import numpy as np
- import logging
- from scipy import stats
-
- logger = logging.getLogger(__name__)
-
- def calculate_final_score(
-     quality_score: float,
-     aesthetics_score: float,
-     prompt_score: float,
-     ai_detection_score: float,
-     has_prompt: bool = True
- ) -> float:
-     """
-     Calculate weighted composite score for image evaluation
-
-     Args:
-         quality_score: Technical image quality (0-10)
-         aesthetics_score: Visual appeal score (0-10)
-         prompt_score: Prompt adherence score (0-10)
-         ai_detection_score: AI generation probability (0-1)
-         has_prompt: Whether prompt metadata is available
-
-     Returns:
-         Final composite score (0-10)
-     """
-     try:
-         # Validate input scores
-         quality_score = max(0.0, min(10.0, quality_score))
-         aesthetics_score = max(0.0, min(10.0, aesthetics_score))
-         prompt_score = max(0.0, min(10.0, prompt_score))
-         ai_detection_score = max(0.0, min(1.0, ai_detection_score))
-
-         if has_prompt:
-             # Standard weights when prompt is available
-             weights = {
-                 'quality': 0.25,       # 25% - Technical quality
-                 'aesthetics': 0.35,    # 35% - Visual appeal (highest weight)
-                 'prompt': 0.25,        # 25% - Prompt following
-                 'ai_detection': 0.15   # 15% - AI detection (inverted)
-             }
-
-             # Calculate weighted score
-             score = (
-                 quality_score * weights['quality'] +
-                 aesthetics_score * weights['aesthetics'] +
-                 prompt_score * weights['prompt'] +
-                 (1 - ai_detection_score) * weights['ai_detection']
-             )
-         else:
-             # Redistribute prompt weight when no prompt available
-             weights = {
-                 'quality': 0.375,      # 25% + 12.5% from prompt
-                 'aesthetics': 0.475,   # 35% + 12.5% from prompt
-                 'ai_detection': 0.15   # 15% - AI detection (inverted)
-             }
-
-             # Calculate weighted score without prompt
-             score = (
-                 quality_score * weights['quality'] +
-                 aesthetics_score * weights['aesthetics'] +
-                 (1 - ai_detection_score) * weights['ai_detection']
-             )
-
-         # Ensure score is in valid range
-         final_score = max(0.0, min(10.0, score))
-
-         logger.debug(f"Score calculation - Quality: {quality_score:.2f}, "
-                      f"Aesthetics: {aesthetics_score:.2f}, Prompt: {prompt_score:.2f}, "
-                      f"AI Detection: {ai_detection_score:.3f}, Has Prompt: {has_prompt}, "
-                      f"Final: {final_score:.2f}")
-
-         return final_score
-
-     except Exception as e:
-         logger.error(f"Error calculating final score: {str(e)}")
-         return 5.0  # Default neutral score
-
- def calculate_category_rankings(scores_list: list, category: str) -> list:
-     """
-     Calculate rankings for a specific category
-
-     Args:
-         scores_list: List of score dictionaries
-         category: Category to rank by ('quality_score', 'aesthetics_score', etc.)
-
-     Returns:
-         List of rankings (1-based)
-     """
-     try:
-         if not scores_list or category not in scores_list[0]:
-             return [1] * len(scores_list)
-
-         # Extract scores for the category
-         category_scores = [item[category] for item in scores_list]
-
-         # Calculate rankings (higher score = better rank)
-         rankings = []
-         for i, score in enumerate(category_scores):
-             rank = 1
-             for j, other_score in enumerate(category_scores):
-                 if other_score > score:
-                     rank += 1
-             rankings.append(rank)
-
-         return rankings
-
-     except Exception as e:
-         logger.error(f"Error calculating category rankings: {str(e)}")
-         return list(range(1, len(scores_list) + 1))
-
- def normalize_scores(scores: list, target_range: tuple = (0, 10)) -> list:
-     """
-     Normalize a list of scores to a target range
-
-     Args:
-         scores: List of numerical scores
-         target_range: Tuple of (min, max) for target range
-
-     Returns:
-         List of normalized scores
-     """
-     try:
-         if not scores:
-             return []
-
-         min_score = min(scores)
-         max_score = max(scores)
-
-         # Avoid division by zero
-         if max_score == min_score:
-             return [target_range[1]] * len(scores)
-
-         target_min, target_max = target_range
-         target_span = target_max - target_min
-         score_span = max_score - min_score
-
-         normalized = []
-         for score in scores:
-             normalized_score = target_min + (score - min_score) * target_span / score_span
-             normalized.append(max(target_min, min(target_max, normalized_score)))
-
-         return normalized
-
-     except Exception as e:
-         logger.error(f"Error normalizing scores: {str(e)}")
-         return scores
-
- def calculate_confidence_intervals(scores: list, confidence_level: float = 0.95) -> dict:
-     """
-     Calculate confidence intervals for a list of scores
-
-     Args:
-         scores: List of numerical scores
-         confidence_level: Confidence level (0-1)
-
-     Returns:
-         Dictionary with mean, std, lower_bound, upper_bound
-     """
-     try:
-         if not scores:
-             return {'mean': 0, 'std': 0, 'lower_bound': 0, 'upper_bound': 0}
-
-         mean_score = np.mean(scores)
-         std_score = np.std(scores)
-
-         # Calculate confidence interval using t-distribution
-         n = len(scores)
-         t_value = stats.t.ppf((1 + confidence_level) / 2, n - 1)
-         margin_error = t_value * std_score / np.sqrt(n)
-
-         return {
-             'mean': float(mean_score),
-             'std': float(std_score),
-             'lower_bound': float(mean_score - margin_error),
-             'upper_bound': float(mean_score + margin_error)
-         }
-
-     except Exception as e:
-         logger.error(f"Error calculating confidence intervals: {str(e)}")
-         return {'mean': 0, 'std': 0, 'lower_bound': 0, 'upper_bound': 0}
-
- def detect_outliers(scores: list, method: str = 'iqr') -> list:
-     """
-     Detect outliers in a list of scores
-
-     Args:
-         scores: List of numerical scores
-         method: Method to use ('iqr', 'zscore', 'modified_zscore')
-
-     Returns:
-         List of boolean values indicating outliers
-     """
-     try:
-         if not scores or len(scores) < 3:
-             return [False] * len(scores)
-
-         scores_array = np.array(scores)
-
-         if method == 'iqr':
-             # Interquartile Range method
-             q1 = np.percentile(scores_array, 25)
-             q3 = np.percentile(scores_array, 75)
-             iqr = q3 - q1
-             lower_bound = q1 - 1.5 * iqr
-             upper_bound = q3 + 1.5 * iqr
-             outliers = (scores_array < lower_bound) | (scores_array > upper_bound)
-
-         elif method == 'zscore':
-             # Z-score method
-             z_scores = np.abs(stats.zscore(scores_array))
-             outliers = z_scores > 2.5
-
-         elif method == 'modified_zscore':
-             # Modified Z-score method (more robust)
-             median = np.median(scores_array)
-             mad = np.median(np.abs(scores_array - median))
-             modified_z_scores = 0.6745 * (scores_array - median) / mad
-             outliers = np.abs(modified_z_scores) > 3.5
-
-         else:
-             # Unknown method: flag nothing
-             outliers = np.zeros(len(scores), dtype=bool)
-
-         return outliers.tolist()
-
-     except Exception as e:
-         logger.error(f"Error detecting outliers: {str(e)}")
-         return [False] * len(scores)
-
- def calculate_score_distribution(scores: list) -> dict:
-     """
-     Calculate distribution statistics for scores
-
-     Args:
-         scores: List of numerical scores
-
-     Returns:
-         Dictionary with distribution statistics
-     """
-     try:
-         if not scores:
-             return {}
-
-         scores_array = np.array(scores)
-
-         distribution = {
-             'count': len(scores),
-             'mean': float(np.mean(scores_array)),
-             'median': float(np.median(scores_array)),
-             'std': float(np.std(scores_array)),
-             'min': float(np.min(scores_array)),
-             'max': float(np.max(scores_array)),
-             'q1': float(np.percentile(scores_array, 25)),
-             'q3': float(np.percentile(scores_array, 75)),
-             'skewness': float(stats.skew(scores_array)),
-             'kurtosis': float(stats.kurtosis(scores_array))
-         }
-
-         return distribution
-
-     except Exception as e:
-         logger.error(f"Error calculating score distribution: {str(e)}")
-         return {}
-
- def apply_score_adjustments(
-     scores: dict,
-     adjustments: dict = None
- ) -> dict:
-     """
-     Apply custom score adjustments based on specific criteria
-
-     Args:
-         scores: Dictionary of scores
-         adjustments: Dictionary of adjustment parameters
-
-     Returns:
-         Dictionary of adjusted scores
-     """
-     try:
-         if adjustments is None:
-             adjustments = {}
-
-         adjusted_scores = scores.copy()
-
-         # Apply anime mode adjustments
-         if adjustments.get('anime_mode', False):
-             # Boost aesthetics score for anime images
-             if 'aesthetics_score' in adjusted_scores:
-                 adjusted_scores['aesthetics_score'] *= 1.1
-                 adjusted_scores['aesthetics_score'] = min(10.0, adjusted_scores['aesthetics_score'])
-
-         # Apply quality penalties for low resolution
-         if adjustments.get('penalize_low_resolution', True):
-             width = adjustments.get('width', 1024)
-             height = adjustments.get('height', 1024)
-             total_pixels = width * height
-
-             if total_pixels < 262144:  # Less than 512x512
-                 penalty = 0.8
-                 if 'quality_score' in adjusted_scores:
-                     adjusted_scores['quality_score'] *= penalty
-
-         # Apply prompt complexity adjustments
-         prompt_length = adjustments.get('prompt_length', 0)
-         if prompt_length > 0 and 'prompt_score' in adjusted_scores:
-             if prompt_length > 100:  # Very long prompts are harder to follow
-                 adjusted_scores['prompt_score'] *= 0.95
-             elif prompt_length < 10:  # Very short prompts are easier
-                 adjusted_scores['prompt_score'] *= 1.05
-                 adjusted_scores['prompt_score'] = min(10.0, adjusted_scores['prompt_score'])
-
-         return adjusted_scores
-
-     except Exception as e:
-         logger.error(f"Error applying score adjustments: {str(e)}")
-         return scores
-
- def generate_score_summary(results_list: list) -> dict:
-     """
-     Generate summary statistics for a batch of evaluation results
-
-     Args:
-         results_list: List of result dictionaries
-
-     Returns:
-         Dictionary with summary statistics
-     """
-     try:
-         if not results_list:
-             return {}
-
-         # Extract scores by category
-         categories = ['quality_score', 'aesthetics_score', 'prompt_score', 'ai_detection_score', 'final_score']
-         summary = {}
-
-         for category in categories:
-             if category in results_list[0]:
-                 scores = [result[category] for result in results_list if category in result]
-                 if scores:
-                     summary[category] = calculate_score_distribution(scores)
-
-         # Calculate overall statistics
-         final_scores = [result['final_score'] for result in results_list if 'final_score' in result]
-         if final_scores:
-             summary['overall'] = {
-                 'total_images': len(results_list),
-                 'average_score': np.mean(final_scores),
-                 'best_score': max(final_scores),
-                 'worst_score': min(final_scores),
-                 'score_range': max(final_scores) - min(final_scores),
-                 'images_with_prompts': sum(1 for r in results_list if r.get('has_prompt', False))
-             }
-
-         return summary
-
-     except Exception as e:
-         logger.error(f"Error generating score summary: {str(e)}")
-         return {}
-
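
For context on how the deleted helpers fit together, here is a minimal usage sketch, assuming the module as it existed before this commit. Only the function names, parameters, and adjustment keys come from the file shown above; the sample values and the `results` list are hypothetical.

```python
# Hypothetical usage sketch for the deleted utils/scoring.py helpers
# (as they existed before this commit). Sample values are illustrative.
from utils.scoring import (
    calculate_final_score,
    apply_score_adjustments,
    generate_score_summary,
)

# Composite score for one image with prompt metadata available
final = calculate_final_score(
    quality_score=7.5,
    aesthetics_score=8.2,
    prompt_score=6.9,
    ai_detection_score=0.12,
    has_prompt=True,
)

# Optional per-image adjustments: anime boost plus the low-resolution
# penalty (480x480 falls below the 512x512 threshold used in the file)
adjusted = apply_score_adjustments(
    {"quality_score": 7.5, "aesthetics_score": 8.2, "prompt_score": 6.9},
    {"anime_mode": True, "width": 480, "height": 480, "prompt_length": 42},
)

# Batch-level summary across per-image result dicts
results = [
    {"quality_score": 7.5, "aesthetics_score": 8.2, "prompt_score": 6.9,
     "ai_detection_score": 0.12, "final_score": final, "has_prompt": True},
    {"quality_score": 6.1, "aesthetics_score": 5.8, "prompt_score": 7.4,
     "ai_detection_score": 0.40, "final_score": 6.0, "has_prompt": True},
]
print(generate_score_summary(results))
```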