# fix.py
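"""
Fixes exam questions stored in the Supabase `exam_contents` table.

For every row not yet marked `is_fixed`, the script runs basic quality checks,
deletes rows that are unusable (image-based questions, missing fields,
duplicate options, invalid answers), and, for rows flagged as needing a fix,
asks Azure OpenAI to reformat the content. Work is spread across a thread pool
behind a shared rate limiter, and a per-run summary log is written alongside
`fix.log`.

Usage:
    python fix.py
"""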

import concurrent.futures
import functools
import json
import logging
import os
import re
import threading
import time
from datetime import datetime
from typing import Any, Dict, Optional

from dotenv import load_dotenv
from openai import AzureOpenAI
from supabase import Client, create_client
from tqdm import tqdm


# Set up logging with thread safety and custom formatting
class CustomFormatter(logging.Formatter):
    """Custom formatter with colors and better formatting"""
    grey = "\x1b[38;21m"
    blue = "\x1b[38;5;39m"
    yellow = "\x1b[38;5;226m"
    red = "\x1b[38;5;196m"
    bold_red = "\x1b[31;1m"
    reset = "\x1b[0m"

    def __init__(self, fmt):
        super().__init__()
        self.fmt = fmt
        self.FORMATS = {
            logging.DEBUG: self.grey + self.fmt + self.reset,
            logging.INFO: self.blue + self.fmt + self.reset,
            logging.WARNING: self.yellow + self.fmt + self.reset,
            logging.ERROR: self.red + self.fmt + self.reset,
            logging.CRITICAL: self.bold_red + self.fmt + self.reset
        }

    def format(self, record):
        log_fmt = self.FORMATS.get(record.levelno)
        formatter = logging.Formatter(log_fmt)
        return formatter.format(record)

# Set up logging configuration
logger = logging.getLogger('fix')
logger.setLevel(logging.INFO)

# File handler with simple formatting
file_handler = logging.FileHandler('fix.log')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)

# Console handler with color formatting
console_handler = logging.StreamHandler()
console_handler.setFormatter(CustomFormatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(console_handler)

# Create a summary log file for each run
current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
summary_file = f'fix_summary_{current_time}.log'
summary_handler = logging.FileHandler(summary_file)
summary_handler.setFormatter(logging.Formatter('%(message)s'))
summary_logger = logging.getLogger('summary')
summary_logger.addHandler(summary_handler)
summary_logger.setLevel(logging.INFO)

# Load environment variables from .env file (if present)
load_dotenv()

# Constants
MIN_PASSAGE_WORDS = 100  # Minimum number of words for reading_passage
VALID_CORRECT_ANSWERS = {'A', 'B', 'C', 'D'}
EXAM_TYPES = ["SAT", "IELTS", "TOEFL"]
DIFFICULTY_LEVELS = ["Easy", "Medium", "Hard"]

# Load environment variables
SUPABASE_URL = os.getenv("SUPABASE_DB_URL")
SUPABASE_API_KEY = os.getenv("SUPABASE_API_KEY")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME_FIX", "gpt-4o-mini")  # Use specific deployment for fixing
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2023-05-15")

# Validate environment variables
missing_vars = []
if not SUPABASE_URL:
    missing_vars.append("SUPABASE_DB_URL")
if not SUPABASE_API_KEY:
    missing_vars.append("SUPABASE_API_KEY")
if not AZURE_OPENAI_KEY:
    missing_vars.append("AZURE_OPENAI_KEY")
if not AZURE_OPENAI_ENDPOINT:
    missing_vars.append("AZURE_OPENAI_ENDPOINT")

if missing_vars:
    error_msg = f"Missing required environment variables: {', '.join(missing_vars)}"
    logger.error(error_msg)
    raise ValueError(error_msg)

# Initialize Supabase client
supabase: Client = create_client(SUPABASE_URL, SUPABASE_API_KEY)

# Initialize Azure OpenAI client
client = AzureOpenAI(
    api_key=AZURE_OPENAI_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT
)

# Thread-safe counter for progress tracking
class AtomicCounter:
    def __init__(self, initial=0):
        self._value = initial
        self._lock = threading.Lock()

    def increment(self):
        with self._lock:
            self._value += 1
            return self._value

    def value(self):
        with self._lock:
            return self._value

class RateLimiter:
    """Rate limiter implementation using token bucket algorithm"""
    def __init__(self, max_calls: int, period: float):
        self.max_calls = max_calls
        self.period = period
        self.calls = []
        self.lock = threading.Lock()

    def __call__(self, func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
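            # The lock is held for the whole window check, including any sleep,
            # so concurrent callers queue here; this serializes calls enough to
            # enforce one shared rate limit across all worker threads.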
            with self.lock:
                now = time.time()
                # Remove old calls outside the window
                self.calls = [call for call in self.calls if call > now - self.period]
                
                if len(self.calls) >= self.max_calls:
                    sleep_time = self.calls[0] - (now - self.period)
                    if sleep_time > 0:
                        time.sleep(sleep_time)
                        # Recalculate after sleep
                        now = time.time()
                        self.calls = [call for call in self.calls if call > now - self.period]
                
                self.calls.append(now)
                
            return func(*args, **kwargs)
        return wrapped

# Initialize Rate Limiter: 60 calls per minute
rate_limiter = RateLimiter(max_calls=60, period=60)

@rate_limiter
def generate_fixed_content(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Uses Azure OpenAI to generate fixed content for a row.
    Returns a dictionary with fixed content or None if generation fails.
    """
    try:
        # Determine if this is a math question
        domain = (row.get('domain') or '').lower()
        is_math = any(math_term in domain for math_term in ['math', 'algebra', 'geometry', 'calculus', 'arithmetic'])
        
        # Create system message with domain-specific instructions
        system_message = """You are an expert in standardized English test content. You must return your response as a valid JSON object with the following structure:
{
    "reading_passage": "formatted passage text",
    "question_text": "formatted question",
    "option_a": "option A text",
    "option_b": "option B text",
    "option_c": "option C text",
    "option_d": "option D text",
    "explanation": "explanation text"
}"""

        if is_math:
            system_message += """
IMPORTANT: For ALL mathematics questions:
- You MUST set reading_passage to an empty string (""). No exceptions.
- Move any context or problem setup from the reading passage into the question_text
- The question_text should contain all necessary mathematical information
- Format: reading_passage must be "", question_text contains everything

Example math question format:
{
    "reading_passage": "",
    "question_text": "In the given system of equations, y = -1.5 and y = x^2 + 8x + a, where a is a positive constant. The system has exactly one distinct real solution. What is the value of a?",
    ...
}"""
        else:
            system_message += """
For reading comprehension questions:
- Format the reading_passage professionally with proper paragraphing
- Ensure the question is answerable from the passage
- Make answer options clear and distinct
- Reference the passage in the explanation"""

        # Create user message with the content to fix
        user_message = f"""Please format and fix the following exam content, returning a JSON object with the specified structure:

Domain: {domain}

Reading Passage:
{row.get('reading_passage', '')}

Question:
{row.get('question_text', '')}

Options:
A) {row.get('option_a', '')}
B) {row.get('option_b', '')}
C) {row.get('option_c', '')}
D) {row.get('option_d', '')}

Explanation:
{row.get('explanation', '')}"""

        # Call Azure OpenAI API with JSON mode
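        # (JSON mode requires the word "JSON" to appear in the messages,
        # which the system prompt above satisfies.)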
        response = client.chat.completions.create(
            model=AZURE_OPENAI_DEPLOYMENT_NAME,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ],
            temperature=0.3,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            response_format={"type": "json_object"}
        )

        # Extract the response content
        if not response.choices:
            logger.error("No response generated from OpenAI")
            return None

        content = response.choices[0].message.content

        # Calculate cost (gpt-4o-mini pricing)
        input_tokens = (len(system_message) + len(user_message)) / 4  # Rough estimate: 4 chars per token
        output_tokens = len(content) / 4
        # gpt-4o-mini pricing:
        # Input: $0.300 per 1M tokens
        # Output: $1.200 per 1M tokens
        fix_cost = (input_tokens / 1_000_000 * 0.300) + (output_tokens / 1_000_000 * 1.200)
        logger.info(f"Estimated cost for fixing this question: ${fix_cost:.6f}")

        try:
            # Parse JSON response
            fixed_data = json.loads(content)
            
            # For math questions, ensure reading passage is empty
            if is_math and (fixed_data.get('reading_passage') or '').strip():
                # Move reading passage content to question text if needed
                current_passage = (fixed_data.get('reading_passage') or '').strip()
                current_question = (fixed_data.get('question_text') or '').strip()
                if current_passage:
                    fixed_data['question_text'] = f"{current_passage} {current_question}"
                    fixed_data['reading_passage'] = ""
            
            # Copy over unchanged fields
            for key in row:
                if key not in fixed_data and key != 'id':
                    fixed_data[key] = row[key]
            
            # Add the fix cost to the data
            fixed_data['fix_cost'] = fix_cost

            return fixed_data

        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON response: {str(e)}")
            return None
        except Exception as e:
            logger.error(f"Error processing response: {str(e)}")
            return None

    except Exception as e:
        logger.error(f"Error generating fixed content: {str(e)}")
        return None

def word_count(text: str) -> int:
    """Returns the number of words in a given text."""
    return len(text.split())

def is_valid_correct_answer(answer: str) -> bool:
    """Checks if the correct_answer is one of A, B, C, D."""
    return answer.upper() in VALID_CORRECT_ANSWERS

def clean_text(text: str) -> str:
    """Cleans the text by removing unwanted characters and extra spaces."""
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = text.strip()
    return text

def check_row_quality(row: Dict[str, Any]) -> Optional[bool]:
    """
    Check if a row meets quality standards.
    Returns True if the row is good quality, False if it needs fixing,
    or None if it should be deleted.
    """
    # Skip if already marked as fixed
    if row.get('is_fixed', False):
        return True

    # Check for image-related questions that should be deleted
    question_text = (row.get('question_text') or '').lower()
    reading_passage = (row.get('reading_passage') or '').lower()
    
    # Keywords that indicate image-based questions
    image_keywords = [
        'image', 'picture', 'diagram', 'graph', 'figure', 'photo', 'illustration',
        'shown', 'depicted', 'displayed', 'above', 'below', 'following figure',
        'look at the', 'in this picture', 'as shown', 'pictured'
    ]
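    # Note: broad terms like 'above', 'below', and 'shown' will also match
    # text-only questions that reference "the passage above", so this check
    # errs on the side of deleting anything that might depend on a visual.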
    
    # Check if question or passage refers to images
    if any(keyword in question_text for keyword in image_keywords) or \
       any(keyword in reading_passage for keyword in image_keywords):
        logger.info(f"Row {row.get('id')}: Marked for deletion - contains image references")
        return None  # Return None to indicate deletion
        
    # Basic validation for required fields
    if not row.get('question_text') or not row.get('explanation'):
        logger.info(f"Row {row.get('id')}: Marked for deletion - missing required fields")
        return None
        
    if not all(row.get(f'option_{opt}') for opt in ['a', 'b', 'c', 'd']):
        logger.info(f"Row {row.get('id')}: Marked for deletion - missing options")
        return None
        
    if not is_valid_correct_answer(row.get('correct_answer', '')):
        logger.info(f"Row {row.get('id')}: Marked for deletion - invalid correct answer")
        return None

    # Option quality checks
    options = [row.get(f'option_{opt}', '').strip() for opt in ['a', 'b', 'c', 'd']]
    if any(len(opt) < 1 for opt in options):
        logger.info(f"Row {row.get('id')}: Marked for deletion - empty options")
        return None
        
    # Check for duplicate options
    if len(set(options)) != 4:
        logger.info(f"Row {row.get('id')}: Marked for deletion - duplicate options")
        return None
        
    # Basic explanation quality check
    explanation = row.get('explanation', '')
    if len(explanation) < 50 or not explanation.strip():
        logger.info(f"Row {row.get('id')}: Marked for deletion - insufficient explanation")
        return None

    return True

def update_row_in_supabase(row_id: str, fixed_data: Dict[str, Any]) -> bool:
    """
    Updates a row in Supabase with fixed data.
    Returns True if successful, False otherwise.
    """
    try:
        response = supabase.table("exam_contents").update(fixed_data).eq("id", row_id).execute()

        # Check if data exists in the response
        if response.data:
            logger.debug(f"HTTP Request: PATCH https://{SUPABASE_URL}/rest/v1/exam_contents?id=eq.{row_id} \"HTTP/2 200 OK\"")
            logger.info(f"Row {row_id}: Successfully updated.")
            return True
        else:
            logger.error(f"Row {row_id}: Failed to update.")
            return False

    except Exception as e:
        logger.error(f"Row {row_id}: Exception while updating - {str(e)}")
        return False

def process_row(row: Dict[str, Any], progress_counter: AtomicCounter, total_rows: int, row_number: int) -> Dict[str, Any]:
    """Process a single row and return the result."""
    try:
        row_id = row.get('id')
        
        # Check quality first
        quality_check = check_row_quality(row)
        
        # If quality_check is None, delete the row
        if quality_check is None:
            try:
                supabase.table("exam_contents").delete().eq("id", row_id).execute()
                logger.info(f"Row {row_id}: Deleted due to quality issues.")
                return {
                    'success': True,
                    'changes_made': ['deleted'],
                    'row_id': row_id,
                    'cost': 0.0
                }
            except Exception as e:
                logger.error(f"Row {row_id}: Failed to delete - {str(e)}")
                return {
                    'success': False,
                    'row_id': row_id,
                    'cost': 0.0
                }
        
        # If row passes quality check, no need to fix
        if quality_check is True:
            # Update is_fixed flag
            try:
                supabase.table("exam_contents").update({"is_fixed": True}).eq("id", row_id).execute()
                logger.info(f"Row {row_id}: Already good quality. Marked as fixed.")
                return {
                    'success': True,
                    'changes_made': ['marked_fixed'],
                    'row_id': row_id,
                    'cost': 0.0
                }
            except Exception as e:
                logger.error(f"Row {row_id}: Failed to update fixed status - {str(e)}")
                return {
                    'success': False,
                    'row_id': row_id,
                    'cost': 0.0
                }

        # Generate fixed content
        fixed_data = generate_fixed_content(row)
        if not fixed_data:
            logger.error(f"Row {row_id}: Failed to generate fixed content.")
            return {
                'success': False,
                'row_id': row_id,
                'cost': 0.0
            }

        # Track what fields were modified
        changes_made = []
        for field in fixed_data:
            if field in row and fixed_data[field] != row[field]:
                changes_made.append(field)

        if changes_made:
            # Add is_fixed flag
            fixed_data['is_fixed'] = True
            
            # Update in database
            try:
                supabase.table("exam_contents").update(fixed_data).eq("id", row_id).execute()
                change_list = ', '.join(changes_made)
                logger.info(f"Row {row_id}: Fixed successfully. Modified: {change_list}")
                return {
                    'success': True,
                    'changes_made': changes_made,
                    'row_id': row_id,
                    'cost': fixed_data.get('fix_cost', 0.0)  # Include the fix cost
                }
            except Exception as e:
                logger.error(f"Row {row_id}: Failed to update - {str(e)}")
                return {
                    'success': False,
                    'row_id': row_id,
                    'cost': 0.0
                }
        else:
            # No changes needed, just mark as fixed
            try:
                supabase.table("exam_contents").update({"is_fixed": True}).eq("id", row_id).execute()
                logger.info(f"Row {row_id}: Fixed successfully. Modified: No changes needed")
                return {
                    'success': True,
                    'changes_made': ['marked_fixed'],
                    'row_id': row_id,
                    'cost': fixed_data.get('fix_cost', 0.0)  # Include the fix cost even if no changes
                }
            except Exception as e:
                logger.error(f"Row {row_id}: Failed to update fixed status - {str(e)}")
                return {
                    'success': False,
                    'row_id': row_id,
                    'cost': 0.0
                }

    except Exception as e:
        logger.error(f"Error processing row {row.get('id', 'unknown')}: {str(e)}")
        return {
            'success': False,
            'row_id': row.get('id', 'unknown'),
            'cost': 0.0
        }

def fetch_all_unfixed_rows(supabase_client: Client, batch_size: int = 1000):
    """
    Fetches all unfixed rows from the exam_contents table in batches.

    Args:
        supabase_client (Client): The Supabase client instance.
        batch_size (int): Number of rows to fetch per batch.

    Yields:
        List[Dict[str, Any]]: A batch of rows.
    """
    # Initialize the starting range
    start = 0
    while True:
        # Fetch a batch of rows
        response = supabase_client.table("exam_contents")\
            .select("*")\
            .eq("is_fixed", False)\
            .range(start, start + batch_size - 1)\
            .execute()

        batch = response.data
        if not batch:
            break  # No more rows to fetch

        yield batch
        start += batch_size

def main():
    """Main function to process and fix exam questions in Supabase using multithreading."""
    start_time = time.time()
    logger.info("Starting fix.py script")
    summary_logger.info("\n=== Question Fix Summary ===\n")
    
    try:
        # Initialize counters
        total_rows = 0
        success_count = 0
        failure_count = 0
        total_cost = 0.0
        changes_by_field = {
            'reading_passage': 0,
            'question_text': 0,
            'option_a': 0,
            'option_b': 0,
            'option_c': 0,
            'option_d': 0,
            'explanation': 0
        }

        # Create a thread pool
        max_workers = min(32, (os.cpu_count() or 1) * 2)  # Adjust based on CPU cores
        logger.info(f"Initializing with {max_workers} threads")
        
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Initialize progress tracking
            progress_counter = AtomicCounter()
            futures = []
            
            # Process rows in batches
            for batch in fetch_all_unfixed_rows(supabase):
                total_rows += len(batch)
                for i, row in enumerate(batch):
                    future = executor.submit(process_row, row, progress_counter, total_rows, i + 1)
                    futures.append(future)
            
            # Track progress with tqdm
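            # All futures were submitted above, so total_rows is final by the
            # time this progress bar is created.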
            with tqdm(total=total_rows, desc="Processing Rows", unit="row", dynamic_ncols=True) as pbar:
                for future in concurrent.futures.as_completed(futures):
                    result = future.result()
                    if result['success']:
                        success_count += 1
                        # Update changes counter
                        for field in result['changes_made']:
                            changes_by_field[field] = changes_by_field.get(field, 0) + 1
                        # Add cost if available
                        if 'cost' in result:
                            total_cost += result['cost']
                    else:
                        failure_count += 1
                    pbar.update(1)

        # Calculate execution time
        execution_time = time.time() - start_time
        
        # Log final statistics
        summary = [
            "\n=== Final Statistics ===",
            f"Total questions processed: {total_rows}",
            f"Successful updates: {success_count}",
            f"Failed updates: {failure_count}",
            f"Total cost: ${total_cost:.6f}",
            f"Execution time: {execution_time:.2f} seconds",
            "\nChanges by field:",
            *[f"- {field}: {count}" for field, count in changes_by_field.items() if count > 0],
            "\n=== End of Summary ===\n"
        ]
        
        # Log to both console and summary file
        for line in summary:
            logger.info(line)
            summary_logger.info(line)

    except Exception as e:
        error_msg = f"An unexpected error occurred: {str(e)}"
        logger.error(error_msg)
        summary_logger.error(error_msg)

if __name__ == "__main__":
    main()