Update fix.py
fix.py
CHANGED
@@ -1,22 +1,22 @@
 # fix.py
 
+import concurrent.futures
+import functools
 import json
 import logging
+import os
+import re
+import threading
 import time
 from datetime import datetime
-import functools
+from typing import Any, Dict, Optional
 
-from openai import AzureOpenAI
-from supabase import create_client, Client
-from tqdm import tqdm
 from dotenv import load_dotenv
+from openai import AzureOpenAI
 from ratelimiter import RateLimiter
+from supabase import Client, create_client
+from tqdm import tqdm
 
 # Set up logging with thread safety and custom formatting
 class CustomFormatter(logging.Formatter):
SUPABASE_API_KEY = os.getenv("SUPABASE_API_KEY")
|
82 |
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
|
83 |
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
|
84 |
+
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME_FIX", "gpt-4o-mini") # Use specific deployment for fixing
|
85 |
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2023-05-15")
|
86 |
|
87 |
# Validate environment variables
|
|
|
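Note (not part of this commit): the deployment name above is consumed by the Azure OpenAI client elsewhere in fix.py. A minimal sketch of the usual wiring, with the client construction assumed rather than taken from the file:

import os
from openai import AzureOpenAI

# Sketch only: endpoint, key, and API version come from the environment
# variables defined above; the deployment name is passed per request as `model`.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2023-05-15"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)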
@@ -94,8 +94,6 @@ if not AZURE_OPENAI_KEY:
     missing_vars.append("AZURE_OPENAI_KEY")
 if not AZURE_OPENAI_ENDPOINT:
     missing_vars.append("AZURE_OPENAI_ENDPOINT")
-if not AZURE_OPENAI_DEPLOYMENT_NAME:
-    missing_vars.append("AZURE_OPENAI_DEPLOYMENT_NAME")
 
 if missing_vars:
     error_msg = f"Missing required environment variables: {', '.join(missing_vars)}"
@@ -166,7 +164,11 @@ def generate_fixed_content(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
     Returns a dictionary with fixed content or None if generation fails.
     """
     try:
+        # Determine if this is a math question
+        domain = row.get('domain', '').lower()
+        is_math = any(math_term in domain.lower() for math_term in ['math', 'algebra', 'geometry', 'calculus', 'arithmetic'])
+
+        # Create system message with domain-specific instructions
         system_message = """You are an expert in standardized English test content. You must return your response as a valid JSON object with the following structure:
 {
     "reading_passage": "formatted passage text",
@@ -176,37 +178,35 @@ def generate_fixed_content(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
     "option_c": "option C text",
     "option_d": "option D text",
     "explanation": "explanation text"
-}
-4. Quality:
-- Fix grammar and clarity issues
-- Ensure proper organization
-- Use clear, unambiguous language"""
+}"""
+
+        if is_math:
+            system_message += """
+IMPORTANT: For ALL mathematics questions:
+- You MUST set reading_passage to an empty string (""). No exceptions.
+- Move any context or problem setup from the reading passage into the question_text
+- The question_text should contain all necessary mathematical information
+- Format: reading_passage must be "", question_text contains everything
+
+Example math question format:
+{
+    "reading_passage": "",
+    "question_text": "In the given system of equations, y = -1.5 and y = x^2 + 8x + a, where a is a positive constant. The system has exactly one distinct real solution. What is the value of a?",
+    ...
+}"""
+        else:
+            system_message += """
+For reading comprehension questions:
+- Format the reading_passage professionally with proper paragraphing
+- Ensure the question is answerable from the passage
+- Make answer options clear and distinct
+- Reference the passage in the explanation"""
 
         # Create user message with the content to fix
         user_message = f"""Please format and fix the following exam content, returning a JSON object with the specified structure:
 
+Domain: {domain}
+
 Reading Passage:
 {row.get('reading_passage', '')}
 
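Note: the completion request itself sits in the unchanged region between these hunks. For orientation only, a call of roughly this shape would produce the `response` object parsed below; this is a sketch under assumptions, not the code in fix.py:

# Assumed shape of the request; fix.py's actual parameters may differ.
response = client.chat.completions.create(
    model=AZURE_OPENAI_DEPLOYMENT_NAME,
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ],
    response_format={"type": "json_object"},  # request a JSON object response
)
content = response.choices[0].message.content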
@@ -243,28 +243,35 @@ Explanation:
 
         content = response.choices[0].message.content
 
+        # Calculate cost (gpt-4o-mini pricing)
+        input_tokens = (len(system_message) + len(user_message)) / 4  # Rough estimate: 4 chars per token
+        output_tokens = len(content) / 4
+        # gpt-4o-mini pricing:
+        # Input: $0.300 per 1M tokens
+        # Output: $1.200 per 1M tokens
+        fix_cost = (input_tokens / 1_000_000 * 0.300) + (output_tokens / 1_000_000 * 1.200)
+        logger.info(f"Estimated cost for fixing this question: ${fix_cost:.6f}")
+
         try:
             # Parse JSON response
             fixed_data = json.loads(content)
 
-            if missing_fields:
-                logger.error(f"Missing or empty required fields: {', '.join(missing_fields)}")
-                return None
-
-            # Validate content length
-            short_fields = [field for field in required_fields if len(str(fixed_data.get(field, ''))) < 2]
-            if short_fields:
-                logger.error(f"Fields with insufficient content: {', '.join(short_fields)}")
-                return None
-
+            # For math questions, ensure reading passage is empty
+            if is_math and fixed_data.get('reading_passage', '').strip():
+                # Move reading passage content to question text if needed
+                current_passage = fixed_data.get('reading_passage', '').strip()
+                current_question = fixed_data.get('question_text', '').strip()
+                if current_passage:
+                    fixed_data['question_text'] = f"{current_passage} {current_question}"
+                    fixed_data['reading_passage'] = ""
 
             # Copy over unchanged fields
             for key in row:
                 if key not in fixed_data and key != 'id':
                     fixed_data[key] = row[key]
+
+            # Add the fix cost to the data
+            fixed_data['fix_cost'] = fix_cost
 
             return fixed_data
 
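Note: the cost estimate above uses a characters-divided-by-4 token heuristic. A worked example with hypothetical message sizes (illustration only, same prices as the comments above):

# Hypothetical prompt of ~6000 characters and response of ~2000 characters.
input_tokens = 6000 / 4       # ~1500 tokens
output_tokens = 2000 / 4      # ~500 tokens
fix_cost = (input_tokens / 1_000_000 * 0.300) + (output_tokens / 1_000_000 * 1.200)
# 0.00045 + 0.00060 = $0.00105 per question under these assumptions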
@@ -295,74 +302,59 @@ def clean_text(text: str) -> str:
 
 def check_row_quality(row: Dict[str, Any]) -> bool:
     """
-    Returns True if the row is good, False if it needs fixing.
+    Check if a row meets quality standards.
+    Returns True if the row is good quality, False if it needs fixing.
     """
-    # Skip if already fixed
-    if row.get('is_fixed'):
+    # Skip if already marked as fixed
+    if row.get('is_fixed', False):
         return True
 
-    # Check for valid exam type
-    if row['exam_type'] not in EXAM_TYPES:
-        return False
-
-    # Check for valid difficulty level
-    if row['difficulty_level'] not in DIFFICULTY_LEVELS:
-        return False
-
-    # Check for valid correct answer format
-    if not is_valid_correct_answer(row['correct_answer']):
-        return False
-
-    # Check for common OCR and formatting issues
-    text_fields = ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']
-    for field in text_fields:
-        text = row.get(field, '')
-        if isinstance(text, str):
-            # Check for OCR artifacts
-            if any(artifact in text.lower() for artifact in [
-                'arebasedonthe', 'lineno', 'click here', 'seenext', 'seebelow',
-                'answerthefollowing', 'choosethebest', 'selectthe'
-            ]):
-                return False
-
-            # Check for formatting issues
-            if text.count('.') > 20:  # Too many periods might indicate formatting issues
-                return False
-            if text.count('\n') > 20:  # Too many newlines might indicate formatting issues
-                return False
-            if len(text.split()) < 2:  # Text should have at least 2 words
-                return False
-
-    # Check minimum length requirements
-    if len(row['reading_passage'].split()) < MIN_PASSAGE_WORDS:
-        return False
-
+    # Check for image-related questions that should be deleted
+    question_text = row.get('question_text', '').lower()
+    reading_passage = row.get('reading_passage', '').lower()
+
+    # Keywords that indicate image-based questions
+    image_keywords = [
+        'image', 'picture', 'diagram', 'graph', 'figure', 'photo', 'illustration',
+        'shown', 'depicted', 'displayed', 'above', 'below', 'following figure',
+        'look at the', 'in this picture', 'as shown', 'pictured'
+    ]
+
+    # Check if question or passage refers to images
+    if any(keyword in question_text for keyword in image_keywords) or \
+       any(keyword in reading_passage for keyword in image_keywords):
+        logger.info(f"Row {row.get('id')}: Marked for deletion - contains image references")
+        return None  # Return None to indicate deletion
+
+    # Basic validation for required fields
+    if not row.get('question_text') or not row.get('explanation'):
+        logger.info(f"Row {row.get('id')}: Marked for deletion - missing required fields")
+        return None
+
+    if not all(row.get(f'option_{opt}') for opt in ['a', 'b', 'c', 'd']):
+        logger.info(f"Row {row.get('id')}: Marked for deletion - missing options")
+        return None
+
+    if not is_valid_correct_answer(row.get('correct_answer', '')):
+        logger.info(f"Row {row.get('id')}: Marked for deletion - invalid correct answer")
+        return None
+
+    # Option quality checks
+    options = [row.get(f'option_{opt}', '').strip() for opt in ['a', 'b', 'c', 'd']]
+    if any(len(opt) < 1 for opt in options):
+        logger.info(f"Row {row.get('id')}: Marked for deletion - empty options")
+        return None
+
     # Check for duplicate options
+    if len(set(options)) != 4:
+        logger.info(f"Row {row.get('id')}: Marked for deletion - duplicate options")
+        return None
+
+    # Basic explanation quality check
+    explanation = row.get('explanation', '')
+    if len(explanation) < 50 or not explanation.strip():
+        logger.info(f"Row {row.get('id')}: Marked for deletion - insufficient explanation")
+        return None
 
     return True
 
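Note: check_row_quality now has three outcomes (None to delete, True to keep, False to fix) while the annotation still says `-> bool`. A possible tightening, shown only as a suggestion rather than part of the commit:

from typing import Any, Dict, Optional

def check_row_quality(row: Dict[str, Any]) -> Optional[bool]:
    """None = delete the row, True = keep as-is, False = send for fixing."""
    ...

# Callers then branch explicitly on the three outcomes
# (row here is a question dict fetched from Supabase):
result = check_row_quality(row)
if result is None:
    ...  # delete the row
elif result is True:
    ...  # already good; just mark is_fixed
else:
    ...  # run generate_fixed_content() and update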
@@ -388,77 +380,116 @@ def update_row_in_supabase(row_id: str, fixed_data: Dict[str, Any]) -> bool:
         return False
 
 def process_row(row: Dict[str, Any], progress_counter: AtomicCounter, total_rows: int, row_number: int) -> Dict[str, Any]:
-    """Process a single row
-    result = {
-        'row_id': row.get('id'),
-        'success': False,
-        'message': '',
-        'changes_made': []
-    }
-
+    """Process a single row and return the result."""
     try:
         row_id = row.get('id')
+
+        # Check quality first
+        quality_check = check_row_quality(row)
+
+        # If quality_check is None, delete the row
+        if quality_check is None:
+            try:
+                supabase.table("exam_contents").delete().eq("id", row_id).execute()
+                logger.info(f"Row {row_id}: Deleted due to quality issues.")
+                return {
+                    'success': True,
+                    'changes_made': ['deleted'],
+                    'row_id': row_id,
+                    'cost': 0.0
+                }
+            except Exception as e:
+                logger.error(f"Row {row_id}: Failed to delete - {str(e)}")
+                return {
+                    'success': False,
+                    'row_id': row_id,
+                    'cost': 0.0
+                }
+
+        # If row passes quality check, no need to fix
+        if quality_check is True:
+            # Update is_fixed flag
+            try:
+                supabase.table("exam_contents").update({"is_fixed": True}).eq("id", row_id).execute()
+                logger.info(f"Row {row_id}: Already good quality. Marked as fixed.")
+                return {
+                    'success': True,
+                    'changes_made': ['marked_fixed'],
+                    'row_id': row_id,
+                    'cost': 0.0
+                }
+            except Exception as e:
+                logger.error(f"Row {row_id}: Failed to update fixed status - {str(e)}")
+                return {
+                    'success': False,
+                    'row_id': row_id,
+                    'cost': 0.0
+                }
 
         # Generate fixed content
         fixed_data = generate_fixed_content(row)
         if not fixed_data:
+            logger.error(f"Row {row_id}: Failed to generate fixed content.")
+            return {
+                'success': False,
+                'row_id': row_id,
+                'cost': 0.0
+            }
+
+        # Track what fields were modified
+        changes_made = []
+        for field in fixed_data:
+            if field in row and fixed_data[field] != row[field]:
+                changes_made.append(field)
+
+        if changes_made:
+            # Add is_fixed flag
+            fixed_data['is_fixed'] = True
+
+            # Update in database
+            try:
+                supabase.table("exam_contents").update(fixed_data).eq("id", row_id).execute()
+                change_list = ', '.join(changes_made)
+                logger.info(f"Row {row_id}: Fixed successfully. Modified: {change_list}")
+                return {
+                    'success': True,
+                    'changes_made': changes_made,
+                    'row_id': row_id,
+                    'cost': fixed_data.get('fix_cost', 0.0)  # Include the fix cost
+                }
+            except Exception as e:
+                logger.error(f"Row {row_id}: Failed to update - {str(e)}")
+                return {
+                    'success': False,
+                    'row_id': row_id,
+                    'cost': 0.0
+                }
         else:
+            # No changes needed, just mark as fixed
+            try:
+                supabase.table("exam_contents").update({"is_fixed": True}).eq("id", row_id).execute()
+                logger.info(f"Row {row_id}: Fixed successfully. Modified: No changes needed")
+                return {
+                    'success': True,
+                    'changes_made': ['marked_fixed'],
+                    'row_id': row_id,
+                    'cost': fixed_data.get('fix_cost', 0.0)  # Include the fix cost even if no changes
+                }
+            except Exception as e:
+                logger.error(f"Row {row_id}: Failed to update fixed status - {str(e)}")
+                return {
+                    'success': False,
+                    'row_id': row_id,
+                    'cost': 0.0
+                }
 
     except Exception as e:
+        logger.error(f"Error processing row {row.get('id', 'unknown')}: {str(e)}")
+        return {
+            'success': False,
+            'row_id': row.get('id', 'unknown'),
+            'cost': 0.0
+        }
 
 def fetch_all_unfixed_rows(supabase_client: Client, batch_size: int = 1000):
     """
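Note: process_row returns a plain dict, which is what main() aggregates below. A minimal sketch of how rows are typically fanned out with the concurrent.futures import added above; the variable `rows` and the worker count are assumptions, not from this commit:

import concurrent.futures

counter = AtomicCounter()  # thread-safe progress counter defined in fix.py
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    futures = [
        executor.submit(process_row, row, counter, len(rows), i)
        for i, row in enumerate(rows, start=1)
    ]
    for future in concurrent.futures.as_completed(futures):
        results.append(future.result())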
@@ -499,6 +530,7 @@ def main():
     total_rows = 0
     success_count = 0
     failure_count = 0
+    total_cost = 0.0
     changes_by_field = {
         'reading_passage': 0,
         'question_text': 0,
@@ -534,6 +566,9 @@ def main():
                 # Update changes counter
                 for field in result['changes_made']:
                     changes_by_field[field] = changes_by_field.get(field, 0) + 1
+                # Add cost if available
+                if 'cost' in result:
+                    total_cost += result['cost']
             else:
                 failure_count += 1
             pbar.update(1)
@@ -547,6 +582,7 @@ def main():
         f"Total questions processed: {total_rows}",
         f"Successful updates: {success_count}",
         f"Failed updates: {failure_count}",
+        f"Total cost: ${total_cost:.6f}",
         f"Execution time: {execution_time:.2f} seconds",
         "\nChanges by field:",
         *[f"- {field}: {count}" for field, count in changes_by_field.items() if count > 0],