Update fix.py
fix.py
CHANGED
Removed from the old version (deleted lines carry a leading "-", unchanged context lines carry none; text cut off in the diff view is marked "..."):

@@ -1,30 +1,71 @@
 import os
 import json
 import logging
-import re
-from typing import Dict, Any, Optional
-from io import BytesIO
 import concurrent.futures
 import queue

 from supabase import create_client, Client
 from dotenv import load_dotenv

@@ -33,6 +74,7 @@ load_dotenv()
(additions only)

@@ -56,38 +98,25 @@ if not AZURE_OPENAI_DEPLOYMENT_NAME:
 # Initialize Supabase client
 supabase: Client = create_client(SUPABASE_URL, SUPABASE_API_KEY)
-logging.info("Connected to Supabase successfully.")
-
-# Initialize OpenAI for Azure
-openai.api_type = "azure"
-openai.api_key = AZURE_OPENAI_KEY
-openai.api_base = AZURE_OPENAI_ENDPOINT
-openai.api_version = AZURE_OPENAI_API_VERSION
-
-# Set up Azure OpenAI client
-API_KEY = os.getenv("AZURE_OPENAI_KEY")
-ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
-DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
-
-if not API_KEY or not ENDPOINT or not DEPLOYMENT_NAME:
-    raise ValueError("Azure OpenAI configuration is incomplete.")

 client = AzureOpenAI(
-    api_key=...
-    api_version=...
-    azure_endpoint=...
 )

 class AtomicCounter:
     def __init__(self, initial=0):
         self._value = initial
-        self._lock = Lock()

@@ -98,6 +127,133 @@ class AtomicCounter:
(additions only)

@@ -120,111 +276,70 @@ def check_row_quality(row: Dict[str, Any]) -> bool:
(inside check_row_quality the old required-field and text checks are rewritten; the old generate_fixed_content, which requested a JSON response, is removed together with its extract_json helper:)
-    Returns a dictionary with fixed fields or None if failed.
-    """
-    prompt = f"""Fix and improve the following exam question. Clean up any OCR artifacts, fix formatting issues, and ensure high quality.
-...
-A) {row.get('option_a', '')}
-B) {row.get('option_b', '')}
-C) {row.get('option_c', '')}
-D) {row.get('option_d', '')}
-Correct Answer: {row.get('correct_answer', '')}
-Explanation: {row.get('explanation', '')}
-
-Requirements:
-1. Clean up any OCR artifacts and formatting issues
-2. Maintain the same meaning and difficulty level
-3. Keep the same correct answer
-4. Ensure the explanation clearly justifies the answer
-5. Make sure all text is properly formatted and readable
-6. Preserve all important content and details
-7. Fix any spacing or punctuation issues
-
-Return a JSON object with the following fields:
-{{
-    "reading_passage": "cleaned passage",
-    "question_text": "cleaned question",
-    "option_a": "cleaned option A",
-    "option_b": "cleaned option B",
-    "option_c": "cleaned option C",
-    "option_d": "cleaned option D",
-    "explanation": "cleaned explanation"
-}}"""
-...
-            "content": "You are an expert at fixing and improving exam questions. Clean up formatting while preserving meaning."
-        },
-        {"role": "user", "content": prompt}
-    ],
-    response_format={"type": "json_object"},
-    temperature=0.0
-)
-
-fixed_content = json.loads(response.choices[0].message.content)
-
-# Preserve original fields and update only the fixed ones
-updated_data = row.copy()
-updated_data.update(fixed_content)
-updated_data['is_fixed'] = True
-
-return updated_data
-
-except Exception as e:
-    logging.error(f"Error generating fixed content: {str(e)}")
-    return None
-
-    """
-    Extracts JSON object from a block of text.
-    Returns the JSON string or None if not found.
-    """
-    try:
-        # Find the first { and the last }
-        start = text.find('{')
-        end = text.rfind('}')
-        if start == -1 or end == -1:
-            return None
-        json_str = text[start:end+1]
-        # Validate JSON
-        json.loads(json_str)
-        return json_str
-    except json.JSONDecodeError:
-        return None

@@ -233,117 +348,195 @@ def update_row_in_supabase(row_id: str, fixed_data: Dict[str, Any]) -> bool:
(the old process_row reported progress with print() every ten rows, and the old main() fetched every unfixed row in a single query:)
-def process_row(row: Dict[str, Any], progress_counter: AtomicCounter, total_rows: int) -> Dict[str, Any]:
-    """
-    Process a single row with progress tracking.
-    Returns a dictionary with the results.
-    """
-    row_id = row.get('id')
-    result['message'] = "Successfully fixed and updated" if success else "Failed to update"
-    progress_counter.increment()
-    progress = progress_counter.value()
-    if progress % 10 == 0:  # Update progress every 10 rows
-        print(f"Progress: {progress}/{total_rows} rows processed")
-    logging.info("Starting fix.py script with multithreading.")
-    rows = response.data
-    total_rows = len(rows)
-    logging.info(f"Fetched {total_rows} unfixed rows from exam_contents.")
-    max_workers = min(32, ...
The new fix.py (regions the diff leaves unchanged between hunks are marked "..."):

# fix.py

import os
import re
import json
import logging
import concurrent.futures
from typing import Dict, Any, Optional, List
import queue
import time
from datetime import datetime
import threading
import functools

from openai import AzureOpenAI
from supabase import create_client, Client
from tqdm import tqdm
from dotenv import load_dotenv
from ratelimiter import RateLimiter  # NB: immediately shadowed by the RateLimiter class defined below

# Set up logging with thread safety and custom formatting
class CustomFormatter(logging.Formatter):
    """Custom formatter with colors and better formatting"""
    grey = "\x1b[38;21m"
    blue = "\x1b[38;5;39m"
    yellow = "\x1b[38;5;226m"
    red = "\x1b[38;5;196m"
    bold_red = "\x1b[31;1m"
    reset = "\x1b[0m"

    def __init__(self, fmt):
        super().__init__()
        self.fmt = fmt
        self.FORMATS = {
            logging.DEBUG: self.grey + self.fmt + self.reset,
            logging.INFO: self.blue + self.fmt + self.reset,
            logging.WARNING: self.yellow + self.fmt + self.reset,
            logging.ERROR: self.red + self.fmt + self.reset,
            logging.CRITICAL: self.bold_red + self.fmt + self.reset
        }

    def format(self, record):
        log_fmt = self.FORMATS.get(record.levelno)
        formatter = logging.Formatter(log_fmt)
        return formatter.format(record)

# Set up logging configuration
logger = logging.getLogger('fix')
logger.setLevel(logging.INFO)

# File handler with simple formatting
file_handler = logging.FileHandler('fix.log')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)

# Console handler with color formatting
console_handler = logging.StreamHandler()
console_handler.setFormatter(CustomFormatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(console_handler)

# Create a summary log file for each run
current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
summary_file = f'fix_summary_{current_time}.log'
summary_handler = logging.FileHandler(summary_file)
summary_handler.setFormatter(logging.Formatter('%(message)s'))
summary_logger = logging.getLogger('summary')
summary_logger.addHandler(summary_handler)
summary_logger.setLevel(logging.INFO)

# Load environment variables from .env file (if present)
load_dotenv()

# ... (unchanged lines not shown in the diff) ...

MIN_PASSAGE_WORDS = 100  # Minimum number of words for reading_passage
VALID_CORRECT_ANSWERS = {'A', 'B', 'C', 'D'}
EXAM_TYPES = ["SAT", "IELTS", "TOEFL"]
DIFFICULTY_LEVELS = ["Easy", "Medium", "Hard"]

# Load environment variables
SUPABASE_URL = os.getenv("SUPABASE_DB_URL")

# ... (unchanged lines not shown in the diff) ...

    missing_vars.append("AZURE_OPENAI_DEPLOYMENT_NAME")

if missing_vars:
    error_msg = f"Missing required environment variables: {', '.join(missing_vars)}"
    logger.error(error_msg)
    raise ValueError(error_msg)

# Initialize Supabase client
supabase: Client = create_client(SUPABASE_URL, SUPABASE_API_KEY)

# Initialize Azure OpenAI client
client = AzureOpenAI(
    api_key=AZURE_OPENAI_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT
)

# Thread-safe counter for progress tracking
class AtomicCounter:
    def __init__(self, initial=0):
        self._value = initial
        self._lock = threading.Lock()

    def increment(self):
        with self._lock:
            self._value += 1

    def value(self):
        with self._lock:
            return self._value

class RateLimiter:
    """Rate limiter implementation using token bucket algorithm"""
    def __init__(self, max_calls: int, period: float):
        self.max_calls = max_calls
        self.period = period
        self.calls = []
        self.lock = threading.Lock()

    def __call__(self, func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            with self.lock:
                now = time.time()
                # Remove old calls outside the window
                self.calls = [call for call in self.calls if call > now - self.period]

                if len(self.calls) >= self.max_calls:
                    sleep_time = self.calls[0] - (now - self.period)
                    if sleep_time > 0:
                        time.sleep(sleep_time)
                    # Recalculate after sleep
                    now = time.time()
                    self.calls = [call for call in self.calls if call > now - self.period]

                self.calls.append(now)

            return func(*args, **kwargs)
        return wrapped

# Initialize Rate Limiter: 60 calls per minute
rate_limiter = RateLimiter(max_calls=60, period=60)

@rate_limiter
def generate_fixed_content(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Uses Azure OpenAI to generate fixed content for a row.
    Returns a dictionary with fixed content or None if generation fails.
    """
    try:
        # Create system message with formatting requirements
        system_message = """You are an expert at fixing exam questions. Follow these rules:
1. Maintain academic language and tone
2. Keep all factual information unchanged
3. Fix grammar and clarity issues
4. Ensure options are clear and distinct
5. Format text consistently"""

        # Create user message with the content to fix
        user_message = f"""Please fix the following exam question content:
Reading Passage: {row.get('reading_passage', '')}
Question: {row.get('question_text', '')}
Options:
A) {row.get('option_a', '')}
B) {row.get('option_b', '')}
C) {row.get('option_c', '')}
D) {row.get('option_d', '')}
Explanation: {row.get('explanation', '')}"""

        # Call Azure OpenAI API
        response = client.chat.completions.create(
            model=AZURE_OPENAI_DEPLOYMENT_NAME,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ],
            temperature=0.3,
            max_tokens=2000,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0
        )

        # Extract the response content
        if not response.choices:
            logger.error("No response generated from OpenAI")
            return None

        content = response.choices[0].message.content

        # Parse the response using regex
        fixed_data = {}

        # Extract reading passage
        reading_match = re.search(r"Reading Passage:\s*(.*?)(?=Question:|$)", content, re.DOTALL)
        if reading_match:
            fixed_data['reading_passage'] = reading_match.group(1).strip()

        # Extract question
        question_match = re.search(r"Question:\s*(.*?)(?=Options:|$)", content, re.DOTALL)
        if question_match:
            fixed_data['question_text'] = question_match.group(1).strip()

        # Extract options
        options_pattern = {
            'option_a': r"A\)\s*(.*?)(?=B\)|$)",
            'option_b': r"B\)\s*(.*?)(?=C\)|$)",
            'option_c': r"C\)\s*(.*?)(?=D\)|$)",
            'option_d': r"D\)\s*(.*?)(?=Explanation:|$)"
        }

        for key, pattern in options_pattern.items():
            match = re.search(pattern, content, re.DOTALL)
            if match:
                fixed_data[key] = match.group(1).strip()

        # Extract explanation
        explanation_match = re.search(r"Explanation:\s*(.*?)$", content, re.DOTALL)
        if explanation_match:
            fixed_data['explanation'] = explanation_match.group(1).strip()

        # Validate that all required fields are present
        required_fields = ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']
        if not all(field in fixed_data for field in required_fields):
            logger.error("Missing required fields in generated content")
            return None

        # Copy over unchanged fields
        for key in row:
            if key not in fixed_data and key != 'id':
                fixed_data[key] = row[key]

        return fixed_data

    except Exception as e:
        logger.error(f"Error generating fixed content: {str(e)}")
        return None

def word_count(text: str) -> int:
    """Returns the number of words in a given text."""
    return len(text.split())

# ... (unchanged lines not shown in the diff) ...

def check_row_quality(row: Dict[str, Any]) -> bool:
    # Skip if already fixed
    if row.get('is_fixed'):
        return True

    # Required fields must be present and non-empty
    required_fields = [
        'exam_type', 'content_type', 'exam_section', 'domain', 'subdomain',
        'topic', 'difficulty_level', 'reading_passage', 'question_text',
        'option_a', 'option_b', 'option_c', 'option_d', 'correct_answer',
        'explanation'
    ]

    # Check for missing or empty required fields
    for field in required_fields:
        value = row.get(field, '').strip() if isinstance(row.get(field), str) else row.get(field)
        if not value:
            return False

    # Check for valid exam type
    if row['exam_type'] not in EXAM_TYPES:
        return False

    # Check for valid difficulty level
    if row['difficulty_level'] not in DIFFICULTY_LEVELS:
        return False

    # Check for valid correct answer format
    if not is_valid_correct_answer(row['correct_answer']):
        return False

    # Check for common OCR and formatting issues
    text_fields = ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']
    for field in text_fields:
        text = row.get(field, '')
        if isinstance(text, str):
            # Check for OCR artifacts
            if any(artifact in text.lower() for artifact in [
                'arebasedonthe', 'lineno', 'click here', 'seenext', 'seebelow',
                'answerthefollowing', 'choosethebest', 'selectthe'
            ]):
                return False

            # Check for formatting issues
            if text.count('.') > 20:  # Too many periods might indicate formatting issues
                return False
            if text.count('\n') > 20:  # Too many newlines might indicate formatting issues
                return False
            if len(text.split()) < 2:  # Text should have at least 2 words
                return False

    # Check minimum length requirements
    if len(row['reading_passage'].split()) < MIN_PASSAGE_WORDS:
        return False

    # Check for duplicate options
    options = [row['option_a'], row['option_b'], row['option_c'], row['option_d']]
    if len(set(options)) != len(options):
        return False

    # Check for explanation quality
    explanation = row['explanation']
    if len(explanation.split()) < 10:  # Explanation should be reasonably detailed
        return False
    if not any(word in explanation.lower() for word in ['because', 'since', 'as', 'therefore', 'thus', 'hence']):
        return False

    return True

def update_row_in_supabase(row_id: str, fixed_data: Dict[str, Any]) -> bool:
    """
    Updates the given row in the exam_contents table.
    Returns True on success, False otherwise.
    """
    try:
        response = supabase.table("exam_contents").update(fixed_data).eq("id", row_id).execute()

        # Check if data exists in the response
        if response.data:
            logger.debug(f"HTTP Request: PATCH https://{SUPABASE_URL}/rest/v1/exam_contents?id=eq.{row_id} \"HTTP/2 200 OK\"")
            logger.info(f"Row {row_id}: Successfully updated.")
            return True
        else:
            logger.error(f"Row {row_id}: Failed to update.")
            return False

    except Exception as e:
        logger.error(f"Row {row_id}: Exception while updating - {str(e)}")
        return False

def process_row(row: Dict[str, Any], progress_counter: AtomicCounter, total_rows: int, row_number: int) -> Dict[str, Any]:
    """Process a single row with progress tracking."""
    result = {
        'row_id': row.get('id'),
        'success': False,
        'message': '',
        'changes_made': []
    }

    try:
        row_id = row.get('id')
        if not row_id:
            result['message'] = "Row without ID found"
            logger.warning(f"Row {row_number}: {result['message']}")
            return result

        # Check initial quality
        initial_quality_issues = []
        if not row.get('reading_passage'):
            initial_quality_issues.append("Missing reading passage")
        if not row.get('question_text'):
            initial_quality_issues.append("Missing question text")
        if not all(row.get(f'option_{opt}') for opt in ['a', 'b', 'c', 'd']):
            initial_quality_issues.append("Missing options")
        if not row.get('correct_answer'):
            initial_quality_issues.append("Missing correct answer")

        if initial_quality_issues:
            logger.info(f"Row {row_number}: Quality issues found - {', '.join(initial_quality_issues)}")

        if check_row_quality(row):
            success = update_row_in_supabase(row_id, {'is_fixed': True})
            result['success'] = success
            result['message'] = "Already good quality, marked as fixed"
            if success:
                logger.info(f"Row {row_number}: Already good quality. Marked as fixed.")
            else:
                logger.error(f"Row {row_number}: Failed to mark as fixed.")
            progress_counter.increment()
            return result

        # Generate fixed content
        fixed_data = generate_fixed_content(row)
        if not fixed_data:
            result['message'] = "Failed to fix content"
            logger.error(f"Row {row_number}: Failed to generate fixed content.")
            progress_counter.increment()
            return result

        # Compare changes
        for field in ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']:
            if fixed_data.get(field) != row.get(field):
                result['changes_made'].append(field)

        fixed_data['is_fixed'] = True
        success = update_row_in_supabase(row_id, fixed_data)
        result['success'] = success

        if success:
            changes = ', '.join(result['changes_made']) if result['changes_made'] else 'No changes needed'
            result['message'] = f"Fixed successfully. Changes in: {changes}"
            logger.info(f"Row {row_number}: Fixed successfully. Modified: {changes}")
        else:
            result['message'] = "Failed to update after fixing"
            logger.error(f"Row {row_number}: Failed to update after fixing.")

    except Exception as e:
        result['message'] = f"Error: {str(e)}"
        logger.error(f"Row {row_number}: Error processing - {str(e)}")

    progress_counter.increment()
    return result

def fetch_all_unfixed_rows(supabase_client: Client, batch_size: int = 1000):
    """
    Fetches all unfixed rows from the exam_contents table in batches.

    Args:
        supabase_client (Client): The Supabase client instance.
        batch_size (int): Number of rows to fetch per batch.

    Yields:
        List[Dict[str, Any]]: A batch of rows.
    """
    # Initialize the starting range
    start = 0
    while True:
        # Fetch a batch of rows
        response = supabase_client.table("exam_contents")\
            .select("*")\
            .eq("is_fixed", False)\
            .range(start, start + batch_size - 1)\
            .execute()

        batch = response.data
        if not batch:
            break  # No more rows to fetch

        yield batch
        start += batch_size

def main():
    """Main function to process and fix exam questions in Supabase using multithreading."""
    start_time = time.time()
    logger.info("Starting fix.py script")
    summary_logger.info("\n=== Question Fix Summary ===\n")

    try:
        # Initialize counters
        total_rows = 0
        success_count = 0
        failure_count = 0
        changes_by_field = {
            'reading_passage': 0,
            'question_text': 0,
            'option_a': 0,
            'option_b': 0,
            'option_c': 0,
            'option_d': 0,
            'explanation': 0
        }

        # Create a thread pool
        max_workers = min(32, os.cpu_count() * 2)  # Adjust based on CPU cores
        logger.info(f"Initializing with {max_workers} threads")

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Initialize progress tracking
            progress_counter = AtomicCounter()
            futures = []

            # Process rows in batches
            for batch in fetch_all_unfixed_rows(supabase):
                total_rows += len(batch)
                for i, row in enumerate(batch):
                    future = executor.submit(process_row, row, progress_counter, total_rows, i + 1)
                    futures.append(future)

            # Track progress with tqdm
            with tqdm(total=total_rows, desc="Processing Rows", unit="row", dynamic_ncols=True) as pbar:
                for future in concurrent.futures.as_completed(futures):
                    result = future.result()
                    if result['success']:
                        success_count += 1
                        # Update changes counter
                        for field in result['changes_made']:
                            changes_by_field[field] = changes_by_field.get(field, 0) + 1
                    else:
                        failure_count += 1
                    pbar.update(1)

        # Calculate execution time
        execution_time = time.time() - start_time

        # Log final statistics
        summary = [
            "\n=== Final Statistics ===",
            f"Total questions processed: {total_rows}",
            f"Successful updates: {success_count}",
            f"Failed updates: {failure_count}",
            f"Execution time: {execution_time:.2f} seconds",
            "\nChanges by field:",
            *[f"- {field}: {count}" for field, count in changes_by_field.items() if count > 0],
            "\n=== End of Summary ===\n"
        ]

        # Log to both console and summary file
        for line in summary:
            logger.info(line)
            summary_logger.info(line)

    except Exception as e:
        error_msg = f"An unexpected error occurred: {str(e)}"
        logger.error(error_msg)
        summary_logger.error(error_msg)

if __name__ == "__main__":
    main()
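A note on configuration: every credential above is read with os.getenv() after load_dotenv(), so for local runs the values are typically supplied by a .env file next to the script. A minimal sketch with placeholder values; the variable names are taken from the getenv() calls shown in the diff, except SUPABASE_API_KEY, whose exact name is an assumption since it is read in a section the diff does not show:

    SUPABASE_DB_URL=<your-supabase-project-url>
    SUPABASE_API_KEY=<your-supabase-api-key>
    AZURE_OPENAI_KEY=<your-azure-openai-key>
    AZURE_OPENAI_ENDPOINT=<your-azure-openai-endpoint>
    AZURE_OPENAI_API_VERSION=<your-api-version>
    AZURE_OPENAI_DEPLOYMENT_NAME=<your-deployment-name>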
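The RateLimiter decorator keeps the timestamps of recent calls and, once max_calls of them fall within the last period seconds, sleeps until the oldest one ages out. Because the sleep happens while the lock is held, all worker threads are throttled together, which suits a shared API quota. A self-contained sketch of the same sliding-window idea with toy numbers (2 calls per second rather than the production 60 per minute):

import functools
import threading
import time

class RateLimiter:
    """Allow at most max_calls wrapped calls per rolling period seconds."""
    def __init__(self, max_calls: int, period: float):
        self.max_calls = max_calls
        self.period = period
        self.calls = []                  # timestamps of recent calls
        self.lock = threading.Lock()

    def __call__(self, func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            with self.lock:
                now = time.time()
                # Drop timestamps that have left the window.
                self.calls = [t for t in self.calls if t > now - self.period]
                if len(self.calls) >= self.max_calls:
                    # Sleep until the oldest tracked call ages out.
                    time.sleep(max(0.0, self.calls[0] - (now - self.period)))
                    now = time.time()
                    self.calls = [t for t in self.calls if t > now - self.period]
                self.calls.append(now)
            return func(*args, **kwargs)
        return wrapped

@RateLimiter(max_calls=2, period=1.0)
def ping(i: int) -> None:
    print(f"call {i} at {time.monotonic():.2f}s")

for i in range(5):
    ping(i)   # roughly every third call pauses while the window clears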
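Note that generate_fixed_content() no longer requests a JSON response (the old version did); it parses the model's free-form reply with regular expressions keyed to the same labels used in the user message. The parsing therefore assumes a reply shaped like this hypothetical example; any missing label makes the required-fields check fail, and the function returns None:

    Reading Passage: <cleaned passage>
    Question: <cleaned question>
    Options:
    A) <cleaned option>
    B) <cleaned option>
    C) <cleaned option>
    D) <cleaned option>
    Explanation: <cleaned explanation>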
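For reference, a row that clears every gate in check_row_quality() looks roughly like the sketch below. All values are hypothetical, and is_valid_correct_answer() is presumably defined in a part of the file the diff does not show:

passing_row = {
    'exam_type': 'SAT',                # must be one of EXAM_TYPES
    'difficulty_level': 'Medium',      # must be one of DIFFICULTY_LEVELS
    'correct_answer': 'A',             # must satisfy is_valid_correct_answer()
    'reading_passage': '<at least MIN_PASSAGE_WORDS words>',
    'question_text': 'Which choice best states the main idea of the passage?',
    'option_a': 'First option',        # the four options must be distinct
    'option_b': 'Second option',
    'option_c': 'Third option',
    'option_d': 'Fourth option',
    # at least 10 words and a causal connective such as "because":
    'explanation': 'Option A is correct because the passage returns to this '
                   'central idea in every paragraph.',
    # the remaining required fields (content_type, exam_section, domain,
    # subdomain, topic) must simply be non-empty
}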
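One behavioral caveat in fetch_all_unfixed_rows(): it pages with .range() offsets over the is_fixed = False filter while worker threads are concurrently flipping rows to is_fixed = True, so the filtered result set can shrink between page requests and offset-based pages can skip rows. Rows missed this way remain unfixed and should be picked up by a subsequent run.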
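With the environment variables in place, the script is run directly as python fix.py. It submits every unfixed row to the thread pool, throttles OpenAI calls to 60 per minute, tracks completion with a tqdm progress bar, and writes the closing statistics both to the console and to the per-run fix_summary_<timestamp>.log file.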