Update fix.py
fix.py
CHANGED
@@ -1,22 +1,22 @@
 # fix.py
 
+import concurrent.futures
+import functools
 import json
 import logging
+import os
+import re
+import threading
 import time
 from datetime import datetime
-import functools
+from typing import Any, Dict, Optional
 
-from openai import AzureOpenAI
-from supabase import create_client, Client
-from tqdm import tqdm
 from dotenv import load_dotenv
+from openai import AzureOpenAI
 from ratelimiter import RateLimiter
+from supabase import Client, create_client
+from tqdm import tqdm
 
 # Set up logging with thread safety and custom formatting
 class CustomFormatter(logging.Formatter):
SUPABASE_API_KEY = os.getenv("SUPABASE_API_KEY")
|
82 |
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
|
83 |
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
|
84 |
+
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME_FIX", "gpt-4o-mini") # Use specific deployment for fixing
|
85 |
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2023-05-15")
|
86 |
|
87 |
# Validate environment variables
|
|
|
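Note (not part of this commit): the deployment name above is consumed by the Azure OpenAI client elsewhere in fix.py. A minimal sketch of the usual wiring, with the client construction assumed rather than taken from the file:

import os
from openai import AzureOpenAI

# Sketch only: endpoint, key, and API version come from the environment
# variables defined above; the deployment name is passed per request as `model`.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2023-05-15"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)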
@@ -94,8 +94,6 @@ if not AZURE_OPENAI_KEY:
     missing_vars.append("AZURE_OPENAI_KEY")
 if not AZURE_OPENAI_ENDPOINT:
     missing_vars.append("AZURE_OPENAI_ENDPOINT")
-if not AZURE_OPENAI_DEPLOYMENT_NAME:
-    missing_vars.append("AZURE_OPENAI_DEPLOYMENT_NAME")
 
 if missing_vars:
     error_msg = f"Missing required environment variables: {', '.join(missing_vars)}"
@@ -166,7 +164,11 @@ def generate_fixed_content(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
     Returns a dictionary with fixed content or None if generation fails.
     """
     try:
+        # Determine if this is a math question
+        domain = row.get('domain', '').lower()
+        is_math = any(math_term in domain.lower() for math_term in ['math', 'algebra', 'geometry', 'calculus', 'arithmetic'])
+
+        # Create system message with domain-specific instructions
         system_message = """You are an expert in standardized English test content. You must return your response as a valid JSON object with the following structure:
 {
     "reading_passage": "formatted passage text",
@@ -176,37 +178,35 @@ def generate_fixed_content(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
     "option_c": "option C text",
     "option_d": "option D text",
     "explanation": "explanation text"
-}
-4. Quality:
-- Fix grammar and clarity issues
-- Ensure proper organization
-- Use clear, unambiguous language"""
+}"""
+
+        if is_math:
+            system_message += """
+IMPORTANT: For ALL mathematics questions:
+- You MUST set reading_passage to an empty string (""). No exceptions.
+- Move any context or problem setup from the reading passage into the question_text
+- The question_text should contain all necessary mathematical information
+- Format: reading_passage must be "", question_text contains everything
+
+Example math question format:
+{
+    "reading_passage": "",
+    "question_text": "In the given system of equations, y = -1.5 and y = x^2 + 8x + a, where a is a positive constant. The system has exactly one distinct real solution. What is the value of a?",
+    ...
+}"""
+        else:
+            system_message += """
+For reading comprehension questions:
+- Format the reading_passage professionally with proper paragraphing
+- Ensure the question is answerable from the passage
+- Make answer options clear and distinct
+- Reference the passage in the explanation"""
 
         # Create user message with the content to fix
         user_message = f"""Please format and fix the following exam content, returning a JSON object with the specified structure:
 
+Domain: {domain}
+
 Reading Passage:
 {row.get('reading_passage', '')}
 
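Note: the completion request itself sits in the unchanged region between these hunks. For orientation only, a call of roughly this shape would produce the `response` object parsed below; this is a sketch under assumptions, not the code in fix.py:

# Assumed shape of the request; fix.py's actual parameters may differ.
response = client.chat.completions.create(
    model=AZURE_OPENAI_DEPLOYMENT_NAME,
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ],
    response_format={"type": "json_object"},  # request a JSON object response
)
content = response.choices[0].message.content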
@@ -243,28 +243,35 @@ Explanation:
 
         content = response.choices[0].message.content
 
+        # Calculate cost (gpt-4o-mini pricing)
+        input_tokens = (len(system_message) + len(user_message)) / 4  # Rough estimate: 4 chars per token
+        output_tokens = len(content) / 4
+        # gpt-4o-mini pricing:
+        # Input: $0.300 per 1M tokens
+        # Output: $1.200 per 1M tokens
+        fix_cost = (input_tokens / 1_000_000 * 0.300) + (output_tokens / 1_000_000 * 1.200)
+        logger.info(f"Estimated cost for fixing this question: ${fix_cost:.6f}")
+
         try:
             # Parse JSON response
             fixed_data = json.loads(content)
 
-            if missing_fields:
-                logger.error(f"Missing or empty required fields: {', '.join(missing_fields)}")
-                return None
-
-            # Validate content length
-            short_fields = [field for field in required_fields if len(str(fixed_data.get(field, ''))) < 2]
-            if short_fields:
-                logger.error(f"Fields with insufficient content: {', '.join(short_fields)}")
-                return None
-
+            # For math questions, ensure reading passage is empty
+            if is_math and fixed_data.get('reading_passage', '').strip():
+                # Move reading passage content to question text if needed
+                current_passage = fixed_data.get('reading_passage', '').strip()
+                current_question = fixed_data.get('question_text', '').strip()
+                if current_passage:
+                    fixed_data['question_text'] = f"{current_passage} {current_question}"
+                    fixed_data['reading_passage'] = ""
 
             # Copy over unchanged fields
             for key in row:
                 if key not in fixed_data and key != 'id':
                     fixed_data[key] = row[key]
+
+            # Add the fix cost to the data
+            fixed_data['fix_cost'] = fix_cost
 
             return fixed_data
 
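Note: the cost estimate above uses a characters-divided-by-4 token heuristic. A worked example with hypothetical message sizes (illustration only, same prices as the comments above):

# Hypothetical prompt of ~6000 characters and response of ~2000 characters.
input_tokens = 6000 / 4       # ~1500 tokens
output_tokens = 2000 / 4      # ~500 tokens
fix_cost = (input_tokens / 1_000_000 * 0.300) + (output_tokens / 1_000_000 * 1.200)
# 0.00045 + 0.00060 = $0.00105 per question under these assumptions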
@@ -295,74 +302,59 @@ def clean_text(text: str) -> str:
 
 def check_row_quality(row: Dict[str, Any]) -> bool:
     """
-    Returns True if the row is good, False if it needs fixing.
+    Check if a row meets quality standards.
+    Returns True if the row is good quality, False if it needs fixing.
     """
-    # Skip if already fixed
-    if row.get('is_fixed'):
+    # Skip if already marked as fixed
+    if row.get('is_fixed', False):
         return True
 
-    # Check for valid exam type
-    if row['exam_type'] not in EXAM_TYPES:
-        return False
-
-    # Check for valid difficulty level
-    if row['difficulty_level'] not in DIFFICULTY_LEVELS:
-        return False
-
-    # Check for valid correct answer format
-    if not is_valid_correct_answer(row['correct_answer']):
-        return False
-
-    # Check for common OCR and formatting issues
-    text_fields = ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']
-    for field in text_fields:
-        text = row.get(field, '')
-        if isinstance(text, str):
-            # Check for OCR artifacts
-            if any(artifact in text.lower() for artifact in [
-                'arebasedonthe', 'lineno', 'click here', 'seenext', 'seebelow',
-                'answerthefollowing', 'choosethebest', 'selectthe'
-            ]):
-                return False
-
-            # Check for formatting issues
-            if text.count('.') > 20:  # Too many periods might indicate formatting issues
-                return False
-            if text.count('\n') > 20:  # Too many newlines might indicate formatting issues
-                return False
-            if len(text.split()) < 2:  # Text should have at least 2 words
-                return False
-
-    # Check minimum length requirements
-    if len(row['reading_passage'].split()) < MIN_PASSAGE_WORDS:
-        return False
-
+    # Check for image-related questions that should be deleted
+    question_text = row.get('question_text', '').lower()
+    reading_passage = row.get('reading_passage', '').lower()
+
+    # Keywords that indicate image-based questions
+    image_keywords = [
+        'image', 'picture', 'diagram', 'graph', 'figure', 'photo', 'illustration',
+        'shown', 'depicted', 'displayed', 'above', 'below', 'following figure',
+        'look at the', 'in this picture', 'as shown', 'pictured'
+    ]
+
+    # Check if question or passage refers to images
+    if any(keyword in question_text for keyword in image_keywords) or \
+       any(keyword in reading_passage for keyword in image_keywords):
+        logger.info(f"Row {row.get('id')}: Marked for deletion - contains image references")
+        return None  # Return None to indicate deletion
+
+    # Basic validation for required fields
+    if not row.get('question_text') or not row.get('explanation'):
+        logger.info(f"Row {row.get('id')}: Marked for deletion - missing required fields")
+        return None
+
+    if not all(row.get(f'option_{opt}') for opt in ['a', 'b', 'c', 'd']):
+        logger.info(f"Row {row.get('id')}: Marked for deletion - missing options")
+        return None
+
+    if not is_valid_correct_answer(row.get('correct_answer', '')):
+        logger.info(f"Row {row.get('id')}: Marked for deletion - invalid correct answer")
+        return None
+
+    # Option quality checks
+    options = [row.get(f'option_{opt}', '').strip() for opt in ['a', 'b', 'c', 'd']]
+    if any(len(opt) < 1 for opt in options):
+        logger.info(f"Row {row.get('id')}: Marked for deletion - empty options")
+        return None
+
     # Check for duplicate options
+    if len(set(options)) != 4:
+        logger.info(f"Row {row.get('id')}: Marked for deletion - duplicate options")
+        return None
+
+    # Basic explanation quality check
+    explanation = row.get('explanation', '')
+    if len(explanation) < 50 or not explanation.strip():
+        logger.info(f"Row {row.get('id')}: Marked for deletion - insufficient explanation")
+        return None
 
     return True
 
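Note: check_row_quality now has three outcomes (None to delete, True to keep, False to fix) while the annotation still says `-> bool`. A possible tightening, shown only as a suggestion rather than part of the commit:

from typing import Any, Dict, Optional

def check_row_quality(row: Dict[str, Any]) -> Optional[bool]:
    """None = delete the row, True = keep as-is, False = send for fixing."""
    ...

# Callers then branch explicitly on the three outcomes
# (row here is a question dict fetched from Supabase):
result = check_row_quality(row)
if result is None:
    ...  # delete the row
elif result is True:
    ...  # already good; just mark is_fixed
else:
    ...  # run generate_fixed_content() and update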
@@ -388,77 +380,116 @@ def update_row_in_supabase(row_id: str, fixed_data: Dict[str, Any]) -> bool:
         return False
 
 def process_row(row: Dict[str, Any], progress_counter: AtomicCounter, total_rows: int, row_number: int) -> Dict[str, Any]:
-    """Process a single row
-    result = {
-        'row_id': row.get('id'),
-        'success': False,
-        'message': '',
-        'changes_made': []
-    }
-
+    """Process a single row and return the result."""
     try:
         row_id = row.get('id')
+
+        # Check quality first
+        quality_check = check_row_quality(row)
+
+        # If quality_check is None, delete the row
+        if quality_check is None:
+            try:
+                supabase.table("exam_contents").delete().eq("id", row_id).execute()
+                logger.info(f"Row {row_id}: Deleted due to quality issues.")
+                return {
+                    'success': True,
+                    'changes_made': ['deleted'],
+                    'row_id': row_id,
+                    'cost': 0.0
+                }
+            except Exception as e:
+                logger.error(f"Row {row_id}: Failed to delete - {str(e)}")
+                return {
+                    'success': False,
+                    'row_id': row_id,
+                    'cost': 0.0
+                }
+
+        # If row passes quality check, no need to fix
+        if quality_check is True:
+            # Update is_fixed flag
+            try:
+                supabase.table("exam_contents").update({"is_fixed": True}).eq("id", row_id).execute()
+                logger.info(f"Row {row_id}: Already good quality. Marked as fixed.")
+                return {
+                    'success': True,
+                    'changes_made': ['marked_fixed'],
+                    'row_id': row_id,
+                    'cost': 0.0
+                }
+            except Exception as e:
+                logger.error(f"Row {row_id}: Failed to update fixed status - {str(e)}")
+                return {
+                    'success': False,
+                    'row_id': row_id,
+                    'cost': 0.0
+                }
 
         # Generate fixed content
         fixed_data = generate_fixed_content(row)
         if not fixed_data:
+            logger.error(f"Row {row_id}: Failed to generate fixed content.")
+            return {
+                'success': False,
+                'row_id': row_id,
+                'cost': 0.0
+            }
+
+        # Track what fields were modified
+        changes_made = []
+        for field in fixed_data:
+            if field in row and fixed_data[field] != row[field]:
+                changes_made.append(field)
+
+        if changes_made:
+            # Add is_fixed flag
+            fixed_data['is_fixed'] = True
+
+            # Update in database
+            try:
+                supabase.table("exam_contents").update(fixed_data).eq("id", row_id).execute()
+                change_list = ', '.join(changes_made)
+                logger.info(f"Row {row_id}: Fixed successfully. Modified: {change_list}")
+                return {
+                    'success': True,
+                    'changes_made': changes_made,
+                    'row_id': row_id,
+                    'cost': fixed_data.get('fix_cost', 0.0)  # Include the fix cost
+                }
+            except Exception as e:
+                logger.error(f"Row {row_id}: Failed to update - {str(e)}")
+                return {
+                    'success': False,
+                    'row_id': row_id,
+                    'cost': 0.0
+                }
         else:
+            # No changes needed, just mark as fixed
+            try:
+                supabase.table("exam_contents").update({"is_fixed": True}).eq("id", row_id).execute()
+                logger.info(f"Row {row_id}: Fixed successfully. Modified: No changes needed")
+                return {
+                    'success': True,
+                    'changes_made': ['marked_fixed'],
+                    'row_id': row_id,
+                    'cost': fixed_data.get('fix_cost', 0.0)  # Include the fix cost even if no changes
+                }
+            except Exception as e:
+                logger.error(f"Row {row_id}: Failed to update fixed status - {str(e)}")
+                return {
+                    'success': False,
+                    'row_id': row_id,
+                    'cost': 0.0
+                }
 
     except Exception as e:
+        logger.error(f"Error processing row {row.get('id', 'unknown')}: {str(e)}")
+        return {
+            'success': False,
+            'row_id': row.get('id', 'unknown'),
+            'cost': 0.0
+        }
 
 def fetch_all_unfixed_rows(supabase_client: Client, batch_size: int = 1000):
     """
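Note: process_row returns a plain dict, which is what main() aggregates below. A minimal sketch of how rows are typically fanned out with the concurrent.futures import added above; the variable `rows` and the worker count are assumptions, not from this commit:

import concurrent.futures

counter = AtomicCounter()  # thread-safe progress counter defined in fix.py
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    futures = [
        executor.submit(process_row, row, counter, len(rows), i)
        for i, row in enumerate(rows, start=1)
    ]
    for future in concurrent.futures.as_completed(futures):
        results.append(future.result())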
@@ -499,6 +530,7 @@ def main():
     total_rows = 0
     success_count = 0
     failure_count = 0
+    total_cost = 0.0
     changes_by_field = {
         'reading_passage': 0,
         'question_text': 0,
@@ -534,6 +566,9 @@ def main():
                 # Update changes counter
                 for field in result['changes_made']:
                     changes_by_field[field] = changes_by_field.get(field, 0) + 1
+                # Add cost if available
+                if 'cost' in result:
+                    total_cost += result['cost']
             else:
                 failure_count += 1
             pbar.update(1)
@@ -547,6 +582,7 @@ def main():
         f"Total questions processed: {total_rows}",
         f"Successful updates: {success_count}",
         f"Failed updates: {failure_count}",
+        f"Total cost: ${total_cost:.6f}",
         f"Execution time: {execution_time:.2f} seconds",
         "\nChanges by field:",
         *[f"- {field}: {count}" for field, count in changes_by_field.items() if count > 0],