poemsforaphrodite committed on
Commit 366b3ed · verified · 1 Parent(s): 9b72370

Upload folder using huggingface_hub

Files changed (6)
  1. .github/workflows/update_space.yml +28 -0
  2. .gitignore +4 -0
  3. README.md +76 -8
  4. app.py +1500 -0
  5. fix.py +349 -0
  6. requirements.txt +9 -0
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
+ name: Run Python script
+
+ on:
+   push:
+     branches:
+       - main
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v2
+
+       - name: Set up Python
+         uses: actions/setup-python@v2
+         with:
+           python-version: '3.9'
+
+       - name: Install Gradio
+         run: python -m pip install gradio
+
+       - name: Log in to Hugging Face
+         run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+
+       - name: Deploy to Spaces
+         run: gradio deploy
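Note: the login step assumes a repository Actions secret named `hf_token`. Assuming the GitHub CLI (`gh`) is available, one way to create it:

```bash
# Prompts for the token value and stores it as an encrypted Actions secret
gh secret set hf_token
```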
.gitignore ADDED
@@ -0,0 +1,4 @@
+ /.env
+ /.pytest_cache
+ /generated_pdfs/
+ main.py
README.md CHANGED
@@ -1,12 +1,80 @@
  ---
- title: Rag Ielts
- emoji: 📊
- colorFrom: yellow
- colorTo: blue
- sdk: gradio
- sdk_version: 5.13.1
  app_file: app.py
- pinned: false
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
  ---
+ title: rag_ielts
  app_file: app.py
+ sdk: gradio
+ sdk_version: 4.44.1
  ---
+ # Exam Content Management System
+
+ A Streamlit application for managing and generating exam content using Azure OpenAI and MongoDB.
+
+ ## Features
+
+ - Upload and process PDF files containing exam content
+ - Generate questions based on various criteria
+ - Store and manage exam content in MongoDB
+ - Support for IELTS, TOEFL, and SAT exam types
+ - Beautiful UI with Streamlit
+
+ ## Prerequisites
+
+ - Python 3.8+
+ - MongoDB database
+ - Azure OpenAI API access
+
+ ## Installation
+
+ 1. Clone the repository:
+ ```bash
+ git clone <repository-url>
+ cd <repository-directory>
+ ```
+
+ 2. Install dependencies:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 3. Set up environment variables (see the sample `.env` below):
+    - Copy `.env.example` to `.env`
+    - Fill in your Azure OpenAI credentials
+    - Add your MongoDB connection details
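+
+ For reference, a minimal `.env` might look like the following (the Azure and Supabase variable names are the ones read by `app.py`; all values are placeholders):
+ ```
+ AZURE_OPENAI_KEY=your-azure-openai-key
+ AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
+ AZURE_OPENAI_DEPLOYMENT_NAME=your-deployment-name
+ SUPABASE_DB_URL=https://your-project.supabase.co
+ SUPABASE_API_KEY=your-supabase-api-key
+ MONGODB_URI=mongodb://username:password@host:port/database
+ ```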
+
+ ## MongoDB Setup
+
+ 1. Create a MongoDB database (either local or cloud-hosted, such as MongoDB Atlas)
+ 2. Update the `.env` file with your MongoDB connection string:
+ ```
+ MONGODB_URI=mongodb://username:password@host:port/database
+ MONGODB_DB=exam_content_db
+ ```
+
+ ## Running the Application
+
+ 1. Start the Streamlit app:
+ ```bash
+ streamlit run app.py
+ ```
+
+ 2. Open your browser and navigate to the URL shown in the terminal (usually http://localhost:8501)
+
+ ## Usage
+
+ 1. Upload Content:
+    - Select an exam type
+    - Upload PDF files containing exam content
+    - Process the uploads
+
+ 2. Generate Questions:
+    - Choose the exam type, section, and other criteria
+    - Select or generate reading passages
+    - Generate questions
+    - Download the generated content as a PDF
+
+ ## Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
+
+ ## License
+
+ This project is licensed under the MIT License - see the LICENSE file for details.
app.py ADDED
@@ -0,0 +1,1500 @@
+ import os
+ import streamlit as st
+ import PyPDF2
+ from openai import AzureOpenAI
+ import uuid
+ from typing import List, Dict, Any, Optional
+ from supabase import create_client, Client
+ from dotenv import load_dotenv
+ import json
+ import re
+ from io import BytesIO
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from queue import Queue
+ import threading
+ from pydantic import BaseModel, Field
+ import logging
+ import pandas as pd
+ import plotly.express as px
+ import subprocess
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+
+ # Load environment variables from .env file (if present)
+ load_dotenv()
+
+ # Constants
+ EXAM_TYPES = ["SAT", "IELTS", "TOEFL"]
+ DIFFICULTY_LEVELS = ["Easy", "Medium", "Hard", "Very Hard"]
+
+ class ExamQuestion(BaseModel):
+     exam_type: str
+     content_type: str = "Generated"
+     exam_section: str
+     domain: str
+     subdomain: str
+     topic: str
+     difficulty_level: str = "Medium"
+     reading_passage: str
+     reading_passage_title: Optional[str] = None
+     question_text: str
+     option_a: str
+     option_b: str
+     option_c: str
+     option_d: str
+     correct_answer: str
+     explanation: str
+     is_active: bool = True
+
+ class ExamQuestionResponse(BaseModel):
+     questions: List[ExamQuestion]
+
+ # Set up Azure OpenAI client
+ try:
+     API_KEY = os.getenv("AZURE_OPENAI_KEY")
+     ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
+     DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
+
+     if not API_KEY or not ENDPOINT or not DEPLOYMENT_NAME:
+         raise ValueError("Azure OpenAI configuration is incomplete. Please set AZURE_OPENAI_KEY, AZURE_OPENAI_ENDPOINT, and AZURE_OPENAI_DEPLOYMENT_NAME in the environment variables.")
+
+     logging.info(f"Azure OpenAI Configuration: Endpoint={ENDPOINT}, Deployment Name={DEPLOYMENT_NAME}, API Key={API_KEY[:4]}... (masked)")
+
+     client = AzureOpenAI(
+         api_key=API_KEY,
+         api_version="2024-02-15-preview",
+         azure_endpoint=ENDPOINT
+     )
+     logging.info("Azure OpenAI client initialized successfully.")
+ except ValueError as ve:
+     logging.error(f"Configuration Error: {ve}")
+     st.error("Azure OpenAI configuration is incomplete. Please check the environment variables.")
+ except Exception as e:
+     logging.error(f"Failed to initialize Azure OpenAI client: {e}")
+     st.error(f"Failed to initialize Azure OpenAI client: {str(e)}")
+
+ # Set up Supabase client
+ SUPABASE_URL = os.getenv("SUPABASE_DB_URL")
+ SUPABASE_API_KEY = os.getenv("SUPABASE_API_KEY")
+ if not SUPABASE_URL or not SUPABASE_API_KEY:
+     raise ValueError("Supabase URL and API Key must be set in environment variables.")
+ supabase: Client = create_client(SUPABASE_URL, SUPABASE_API_KEY)
+
+ # Create a thread-safe queue for logging
+ log_queue = Queue()
+
+ def safe_st_warning(message: str):
+     """Thread-safe way to queue warning messages"""
+     log_queue.put(("warning", message))
+
+ def safe_st_error(message: str):
+     """Thread-safe way to queue error messages"""
+     log_queue.put(("error", message))
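+ # Streamlit UI calls are generally not safe to make from worker threads, so the
+ # helpers above only enqueue messages; the main thread drains the queue and
+ # renders them (see generate_questions below).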
+
+ # Define the domain structures
+ domain_structures = {
+     "SAT": """SAT Domains and Subdomains:
+ 1. Reading and Writing:
+    - Information and Ideas:
+      * Central Ideas and Details
+      * Command of Textual Evidence
+      * Command of Quantitative Evidence
+      * Inferences
+      * Words in Context
+    - Craft and Structure:
+      * Text Structure and Purpose
+      * Cross-Text Connections
+      * Rhetorical Synthesis
+      * Boundaries
+      * Transitions
+ 2. Mathematics:
+    - Algebra:
+      * Linear equations in one variable
+      * Linear equations in two variables
+      * Linear functions
+      * Systems of two linear equations in two variables
+      * Linear inequalities in one or two variables
+    - Advanced Mathematics:
+      * Equivalent expressions
+      * Nonlinear equations in one variable and systems of equations in two variables
+      * Nonlinear functions
+    - Problem Solving and Data Analysis:
+      * Ratios, rates, proportional relationships, and units
+      * Percentages
+      * One-variable data: distributions and measures of center and spread
+      * Two-variable data: models and scatterplots
+      * Probability and conditional probability
+      * Inference from sample statistics and margin of error
+      * Evaluating statistical claims: observational studies and experiments
+    - Geometry and Trigonometry:
+      * Area and volume
+      * Lines, angles, and triangles
+      * Right triangles and trigonometry
+      * Circles""",
+
+     "IELTS": """IELTS Domains and Subdomains:
+ 1. Reading:
+    - Information Location:
+      * Scanning for Details
+      * Skimming for Main Ideas
+      * Locating Specific Information
+      * Finding Supporting Evidence
+    - Critical Analysis:
+      * Author's Purpose
+      * Text Organization
+      * Opinion and Attitude
+      * Argument Analysis
+    - Vocabulary and Reference:
+      * Word Meaning in Context
+      * Reference Words
+      * Paraphrase Recognition
+      * Academic Vocabulary
+ 2. Writing:
+    - Task Analysis:
+      * Data Interpretation
+      * Process Description
+      * Compare and Contrast
+      * Problem and Solution
+    - Essay Development:
+      * Argument Construction
+      * Evidence Support
+      * Coherence and Cohesion
+      * Academic Style
+    - Language Control:
+      * Grammar Range
+      * Vocabulary Usage
+      * Sentence Structure
+      * Punctuation
+ 3. Speaking:
+    - Personal Expression:
+      * Self Introduction
+      * Personal Experience
+      * Opinion Expression
+      * Future Plans
+    - Topic Development:
+      * Extended Discourse
+      * Topic Analysis
+      * Example Provision
+      * Abstract Discussion
+    - Communication Skills:
+      * Fluency and Coherence
+      * Pronunciation
+      * Interactive Communication
+      * Response Relevance
+ 4. Listening:
+    - Academic Understanding:
+      * Lecture Comprehension
+      * Discussion Analysis
+      * Main Points Identification
+      * Detail Recognition
+    - Pragmatic Understanding:
+      * Speaker Attitude
+      * Function of Utterances
+      * Degree of Certainty
+      * Speaker Relationship
+    - Connecting Information:
+      * Information Organization
+      * Connecting Content
+      * Understanding Examples
+      * Making Inferences""",
+
+     "TOEFL": """TOEFL Domains and Subdomains:
+ 1. Reading:
+    - Comprehension:
+      * Main Idea and Details
+      * Inference Making
+      * Author's Purpose
+      * Vocabulary in Context
+    - Analysis:
+      * Text Organization
+      * Information Integration
+      * Argument Evaluation
+      * Evidence Assessment
+    - Academic Skills:
+      * Paraphrase Recognition
+      * Summary Skills
+      * Table Completion
+      * Classification
+ 2. Listening:
+    - Academic Understanding:
+      * Lecture Comprehension
+      * Discussion Analysis
+      * Main Points Identification
+      * Detail Recognition
+    - Pragmatic Understanding:
+      * Speaker Attitude
+      * Function of Utterances
+      * Degree of Certainty
+      * Speaker Relationship
+    - Connecting Information:
+      * Information Organization
+      * Connecting Content
+      * Understanding Examples
+      * Making Inferences
+ 3. Speaking:
+    - Independent Tasks:
+      * Opinion Expression
+      * Personal Experience
+      * Preference Justification
+      * Choice Explanation
+    - Integrated Tasks:
+      * Lecture Summary
+      * Reading-Listening Integration
+      * Campus Situation Response
+      * Academic Topic Discussion
+    - Delivery Skills:
+      * Pronunciation
+      * Intonation
+      * Rhythm and Pacing
+      * Natural Flow
+ 4. Writing:
+    - Independent Writing:
+      * Essay Organization
+      * Thesis Development
+      * Evidence Support
+      * Conclusion Writing
+    - Integrated Writing:
+      * Source Integration
+      * Information Synthesis
+      * Accurate Reporting
+      * Response Organization
+    - Language Control:
+      * Grammar Accuracy
+      * Vocabulary Range
+      * Sentence Variety
+      * Academic Style"""
+ }
+
+ def clean_text(text: str) -> str:
+     """Clean extracted text from PDF."""
+     # Remove OCR artifacts and fix common issues
+     text = re.sub(r'\.{3,}', '...', text)  # Replace multiple dots with ellipsis
+     text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
+     text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # Add space between joined words
+     text = re.sub(r'(\d+)\.(\d+)', r'\1. \2', text)  # Fix numbered lists
+     text = text.replace('..............', '')  # Remove dot lines
+     text = re.sub(r'Line\s+\d+', '', text)  # Remove line numbers
+     text = re.sub(r'Page\s+\d+', '', text)  # Remove page numbers
+     text = re.sub(r'CONTINUE\s+\d+', '', text)  # Remove continue markers
+     text = re.sub(r'Unauthorized.*illegal\.', '', text)  # Remove copyright notices
+     text = text.replace('©', '(c)')  # Replace copyright symbol
+
+     # Fix common OCR issues
+     text = re.sub(r'(?<=\d)\s+(?=\d)', '', text)  # Remove spaces between numbers
+     text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)  # Add space between words
+     text = re.sub(r'(?<=\w)\.(?=\w)', '. ', text)  # Add space after period
+     text = re.sub(r'(?<=\w),(?=\w)', ', ', text)  # Add space after comma
+     text = re.sub(r'arebasedonthe', ' are based on the ', text)  # Fix common OCR error
+     text = re.sub(r'Questions\d+\-\d+', 'Questions ', text)  # Clean question numbers
+     text = re.sub(r'\s*\n\s*', '\n', text)  # Clean up newlines
+     text = re.sub(r'\n{3,}', '\n\n', text)  # Reduce multiple newlines
+
+     # Clean up percentage signs and numbers in tables
+     text = re.sub(r'(\d+)\s*\.\s*(\d+)', r'\1.\2', text)  # Fix decimal numbers
+     text = re.sub(r'(\d+)\s*%', r'\1%', text)  # Fix percentage signs
+
+     return text.strip()
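+ # Rough example (hypothetical OCR input) of the intended effect:
+ #   clean_text("Questions1-3 arebasedonthe passage. Page 12")
+ #   yields approximately "Questions are based on the passage."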
+
+ def extract_text_from_pdf(pdf_file) -> List[str]:
+     """
+     Extracts text from a PDF file in overlapping 3-page chunks.
+     For example, for a 6-page PDF:
+     - Chunk 1: Pages 1-2-3
+     - Chunk 2: Pages 2-3-4
+     - Chunk 3: Pages 3-4-5
+     - Chunk 4: Pages 4-5-6
+
+     Args:
+         pdf_file: Uploaded PDF file.
+
+     Returns:
+         List of text chunks, each containing 3 pages with overlap.
+     """
+     reader = PyPDF2.PdfReader(pdf_file)
+     num_pages = len(reader.pages)
+     text_chunks = []
+     chunk_size = 3  # Fixed size of 3 pages per chunk
+
+     # Create overlapping chunks
+     for chunk_start in range(0, max(1, num_pages - chunk_size + 1)):
+         text = ""
+         # Calculate end page for this chunk (exclusive, clamped to the page count)
+         chunk_end = min(chunk_start + chunk_size, num_pages)
+
+         # Extract text from all pages in this chunk
+         for page_num in range(chunk_start, chunk_end):
+             page = reader.pages[page_num]
+             page_text = page.extract_text()
+             if page_text:
+                 text += f"\n--- Page {page_num + 1} ---\n{clean_text(page_text)}\n"
+
+         if text.strip():  # Ensure non-empty
+             text_chunks.append(clean_text(text))
+
+     return text_chunks
+
+ def clean_json_string(text: str) -> str:
+     """
+     Clean and extract JSON from the response text.
+     """
+     # Try to find JSON array or object pattern
+     json_match = re.search(r'(\[|\{).*(\]|\})', text, re.DOTALL)
+     if json_match:
+         potential_json = json_match.group(0)
+         # Remove any markdown code block syntax
+         potential_json = re.sub(r'```json\s*|\s*```', '', potential_json)
+         # Remove any trailing commas before closing brackets
+         potential_json = re.sub(r',(\s*[\}\]])', r'\1', potential_json)
+         return potential_json
+     return text
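+ # Example of the intended effect:
+ #   clean_json_string('```json\n{"a": 1,}\n```')  ->  '{"a": 1}'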
+
+ def process_chunk(chunk: str, exam_type: str, idx: int, structure: str) -> List[Dict[str, Any]]:
+     """
+     Process a single text chunk to generate multiple questions using the Azure OpenAI model with structured output.
+     """
+     # First, clean the text using the LLM
+     clean_text_prompt = f"""Clean and format the following text while preserving its meaning and structure.
+
+ Guidelines:
+ 1. Fix OCR artifacts and formatting issues
+ 2. Add proper spacing between words
+ 3. Fix line breaks and paragraphs
+ 4. Preserve all content including tables and data
+ 5. Keep all numerical values and statistics
+ 6. Maintain academic/formal style
+ 7. Keep the text exactly as is, just make it readable
+ 8. Do not summarize or modify the content
+ 9. Do not add or remove information
+ 10. Keep all citations and references
+
+ Text to clean:
+ {chunk}
+
+ Return ONLY the cleaned text with no additional comments or explanations."""
+
+     try:
+         clean_response = client.chat.completions.create(
+             model=DEPLOYMENT_NAME,
+             messages=[
+                 {
+                     "role": "system",
+                     "content": "You are a text cleaning expert. Your job is to fix formatting and OCR issues while preserving the exact content and meaning of the text."
+                 },
+                 {"role": "user", "content": clean_text_prompt}
+             ],
+             temperature=0.0,  # Use 0 temperature for consistent cleaning
+         )
+
+         cleaned_chunk = clean_response.choices[0].message.content.strip()
+
+         # Now proceed with question generation using the cleaned text
+         prompt = f"""Generate multiple {exam_type} exam questions based on the provided text. You MUST generate at least 3 questions for each chunk of text. Return ONLY a JSON array of questions.
+
+ Domain Structure:
+ {structure}
+
+ IMPORTANT: You MUST include ALL of the following fields for each question. Missing fields will cause errors:
+ - exam_type: The type of exam (e.g., "{exam_type}")
+ - content_type: Set to "Generated"
+ - exam_section: The lowercase exam type (e.g., "{exam_type.lower()}")
+ - domain: The main domain from the structure (e.g., "Reading and Writing")
+ - subdomain: The subdomain from the structure (e.g., "Information and Ideas")
+ - topic: The specific topic from the structure (e.g., "Central Ideas and Details")
+ - difficulty_level: One of ["Easy", "Medium", "Hard", "Very Hard"]
+ - reading_passage: The COMPLETE, cleaned passage text
+ - reading_passage_title: The title if available, or null
+ - question_text: The actual question (REQUIRED)
+ - option_a: First option (REQUIRED)
+ - option_b: Second option (REQUIRED)
+ - option_c: Third option (REQUIRED)
+ - option_d: Fourth option (REQUIRED)
+ - correct_answer: Must be "A", "B", "C", or "D" (REQUIRED)
+ - explanation: Detailed explanation of the correct answer (REQUIRED)
+ - is_active: Set to true
+
+ Instructions for Multiple Questions:
+ 1. You MUST generate at least 3 questions for this text chunk
+ 2. Each question should focus on a different aspect or detail from the text
+ 3. Use different subdomains and topics for variety
+ 4. Vary the difficulty levels across questions
+ 5. Make sure each question tests a different skill or concept
+ 6. Questions should build on each other but be independently answerable
+ 7. Use a mix of question types:
+    - Main idea questions
+    - Detail questions
+    - Inference questions
+    - Vocabulary in context
+    - Purpose questions
+    - Structure questions
+    - Evidence questions
+
+ For each question:
+ - Use the EXACT, COMPLETE passage text provided - do not modify, summarize, or shorten it
+ - Ensure questions are directly related to and answerable from the passage content
+ - Questions should test understanding of key concepts, details, or relationships presented in the passage
+ - All answer options should be plausible but with only one clearly correct answer
+ - The explanation should reference specific parts of the passage to justify the correct answer
+
+ IMPORTANT:
+ - You MUST return an array with AT LEAST 3 questions
+ - Each question must have ALL required fields
+ - Questions must be diverse in type and difficulty
+ - All questions must be directly answerable from the passage
+ - The reading_passage must be identical for all questions from the same chunk
+
+ Text to analyze:
+ \"\"\"
+ {cleaned_chunk}
+ \"\"\""""
+
+         response = client.chat.completions.create(
+             model=DEPLOYMENT_NAME,
+             messages=[
+                 {
+                     "role": "system",
+                     "content": "You are an expert exam question generator. You MUST generate at least 3 complete questions with ALL required fields for each text chunk. Never omit any fields. Ensure proper JSON formatting. Vary the question types and difficulty levels."
+                 },
+                 {"role": "user", "content": prompt}
+             ],
+             response_format={"type": "json_object"},
+             temperature=0.1,
+         )
+
+         # Parse the response content
+         content = response.choices[0].message.content.strip()
+
+         # Clean and parse JSON
+         cleaned_json = clean_json_string(content)
+
+         try:
+             parsed_data = json.loads(cleaned_json)
+
+             # Validate each question has all required fields before creating ExamQuestionResponse
+             required_fields = [
+                 "exam_type", "content_type", "exam_section", "domain", "subdomain",
+                 "topic", "difficulty_level", "reading_passage", "question_text",
+                 "option_a", "option_b", "option_c", "option_d", "correct_answer",
+                 "explanation", "is_active"
+             ]
+
+             if isinstance(parsed_data, list):
+                 questions = parsed_data
+             elif isinstance(parsed_data, dict) and "questions" in parsed_data:
+                 questions = parsed_data["questions"]
+             else:
+                 questions = [parsed_data]
+
+             # Validate each question
+             valid_questions = []
+             for q in questions:
+                 missing_fields = [f for f in required_fields if f not in q or not q[f]]
+                 if not missing_fields:
+                     valid_questions.append(q)
+                 else:
+                     logging.warning(f"Skipping question due to missing fields: {missing_fields}")
+
+             if len(valid_questions) < 3:
+                 logging.warning(f"Generated only {len(valid_questions)} valid questions, expected at least 3")
+
+             if valid_questions:
+                 response_data = ExamQuestionResponse(questions=valid_questions)
+                 return [question.model_dump() for question in response_data.questions]
+             else:
+                 logging.error("No valid questions found after validation")
+                 return []
+
+         except json.JSONDecodeError as je:
+             logging.error(f"JSON parsing error in chunk {idx + 1}: {str(je)}")
+             logging.error(f"Problematic JSON: {cleaned_json[:500]}...")
+             return []
+         except Exception as e:
+             logging.error(f"Error validating questions: {str(e)}")
+             return []
+
+     except Exception as e:
+         logging.error(f"Error processing chunk {idx + 1}: {str(e)}")
+         safe_st_error(f"Error generating questions for chunk {idx + 1}: {str(e)}")
+         return []
+
+ def generate_questions(text_chunks: List[str], exam_type: str) -> List[Dict[str, Any]]:
+     """
+     Generates questions for each text chunk using Azure OpenAI and returns structured JSON.
+     Uses multithreading to process chunks concurrently.
+     """
+     questions = []
+     structure = domain_structures.get(exam_type, "")
+
+     # Create progress tracking elements in the main thread
+     progress_placeholder = st.empty()
+     status_placeholder = st.empty()
+     metrics_placeholder = st.empty()
+
+     # Process chunks concurrently
+     with ThreadPoolExecutor() as executor:
+         futures = [
+             executor.submit(process_chunk, chunk, exam_type, idx, structure)
+             for idx, chunk in enumerate(text_chunks)
+         ]
+
+         completed = 0
+         total = len(text_chunks)
+         total_questions = 0
+
+         # Process results as they complete
+         for future in as_completed(futures):
+             try:
+                 chunk_questions = future.result()
+                 questions.extend(chunk_questions)
+                 total_questions += len(chunk_questions)
+
+                 # Update progress in the main thread
+                 completed += 1
+                 progress = completed / total
+
+                 # Update UI elements
+                 progress_placeholder.progress(progress)
+                 status_placeholder.text(f"Processing chunks: {completed}/{total}")
+                 metrics_placeholder.metric(
+                     label="Progress",
+                     value=f"{completed}/{total} chunks",
+                     delta=f"{total_questions} questions generated"
+                 )
+
+                 # Process any queued messages
+                 while not log_queue.empty():
+                     msg_type, message = log_queue.get()
+                     if msg_type == "warning":
+                         st.warning(message)
+                     elif msg_type == "error":
+                         st.error(message)
+
+             except Exception as e:
+                 st.error(f"Error processing chunk: {str(e)}")
+
+     # Show final summary
+     st.success(f"✅ Processing complete! Generated {total_questions} questions from {total} chunks.")
+
+     # Clear progress tracking elements
+     progress_placeholder.empty()
+     status_placeholder.empty()
+     metrics_placeholder.empty()
+
+     return questions
+
+ def upload_questions_to_supabase(generated_questions: List[Dict[str, Any]], source_file: str):
+     """
+     Uploads generated questions to Supabase.
+
+     Args:
+         generated_questions: List of question dictionaries.
+         source_file: Name of the source PDF file.
+     """
+     # Create progress tracking for uploads
+     st.write("### Upload Progress")
+     upload_progress = st.progress(0)
+     upload_status = st.empty()
+
+     total = len(generated_questions)
+     successful_uploads = 0
+     failed_uploads = 0
+
+     for idx, question in enumerate(generated_questions):
+         # Generate a new valid UUID regardless of what was provided
+         new_uuid = str(uuid.uuid4())
+
+         # Set default values if not present and match the table schema
+         question_fields = {
+             "id": new_uuid,  # Always use our generated UUID
+             "exam_type": question.get("exam_type", "Unknown"),
+             "content_type": question.get("content_type", "Generated"),
+             "exam_section": question.get("exam_section") or question.get("exam_type", "Unknown").lower(),
+             "domain": question.get("domain", "General"),
+             "subdomain": question.get("subdomain", "General"),
+             "topic": question.get("topic", "General"),
+             "difficulty_level": question.get("difficulty_level"),
+             "reading_passage": question.get("reading_passage"),
+             "question_text": question.get("question_text", "Not Available"),
+             "option_a": question.get("option_a"),
+             "option_b": question.get("option_b"),
+             "option_c": question.get("option_c"),
+             "option_d": question.get("option_d"),
+             "correct_answer": question.get("correct_answer", "Not Available"),
+             "explanation": question.get("explanation"),
+             "source_file": source_file,
+             "is_active": question.get("is_active", True),
+             "metadata": json.dumps(question.get("metadata")) if question.get("metadata") else None,
+             "source_text": question.get("source_text")
+         }
+
+         try:
+             # Insert the question and get the response
+             response = supabase.table("exam_contents").insert(question_fields).execute()
+
+             # Check if the response data indicates success
+             if response.data:
+                 successful_uploads += 1
+             else:
+                 failed_uploads += 1
+                 st.warning("Failed to insert question: empty response from Supabase.")
+
+         except Exception as e:
+             failed_uploads += 1
+             st.error(f"Error uploading question: {str(e)}")
+
+         # Update progress
+         progress = (idx + 1) / total
+         upload_progress.progress(progress)
+         upload_status.text(f"Uploading questions: {idx + 1}/{total} (Success: {successful_uploads}, Failed: {failed_uploads})")
+
+     # Show final upload summary
+     if failed_uploads == 0:
+         st.success(f"✅ Upload complete! Successfully uploaded all {successful_uploads} questions.")
+     else:
+         st.warning(f"⚠️ Upload complete with some issues. Successful: {successful_uploads}, Failed: {failed_uploads}")
+
+     # Clear progress elements
+     upload_progress.empty()
+     upload_status.empty()
+
+ def process_pdfs(pdf_files, exam_type):
+     """
+     Process multiple PDF files and generate questions.
+
+     Args:
+         pdf_files: List of uploaded PDF files
+         exam_type: Selected exam type
+
+     Returns:
+         Combined questions JSON and download content
+     """
+     all_questions = []
+     progress_text = st.empty()
+     progress_bar = st.progress(0)
+
+     for i, pdf_file in enumerate(pdf_files):
+         progress_text.text(f"Processing file {i+1}/{len(pdf_files)}: {pdf_file.name}")
+
+         # Convert bytes to a file-like object if necessary
+         if isinstance(pdf_file, bytes):
+             pdf_file_obj = BytesIO(pdf_file)
+         else:
+             pdf_file_obj = pdf_file
+
+         # Extract text
+         text_chunks = extract_text_from_pdf(pdf_file_obj)
+         if not text_chunks:
+             st.warning(f"No text extracted from {pdf_file.name}")
+             continue
+
+         # Generate questions
+         file_questions = generate_questions(text_chunks, exam_type)
+         if file_questions:
+             all_questions.extend(file_questions)
+
+             # Upload to Supabase
+             source_file = pdf_file.name
+             upload_questions_to_supabase(file_questions, source_file)
+
+         # Update progress
+         progress_bar.progress((i + 1) / len(pdf_files))
+
+     progress_text.empty()
+     progress_bar.empty()
+
+     if not all_questions:
+         st.warning("No questions were generated from any of the files.")
+         return None, None
+
+     # Prepare JSON output
+     combined_questions_json = json.dumps(all_questions, indent=4)
+     return combined_questions_json, combined_questions_json.encode('utf-8')
+
+ def get_questions(filters=None):
+     """Fetch questions from Supabase with optional filters."""
+     try:
+         query = supabase.table("exam_contents").select("*")
+
+         if filters:
+             for key, value in filters.items():
+                 if value and value != "All":
+                     query = query.eq(key, value)
+
+         response = query.execute()
+         return response.data
+     except Exception as e:
+         logging.error(f"Error fetching questions: {e}")
+         return []
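+ # e.g. get_questions({"exam_type": "SAT", "difficulty_level": "Hard"})
+ # returns matching rows; filter values of "All" (or empty) are ignored.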
+
+ def get_analytics_data(questions):
+     """Generate analytics data from questions."""
+     df = pd.DataFrame(questions)
+
+     analytics = {
+         'total_questions': len(df),
+         'unfixed_questions': len([q for q in questions if not q.get('is_fixed', False)])
+     }
+
+     # Basic statistics
+     if 'exam_type' in df.columns:
+         analytics['questions_by_exam'] = df['exam_type'].value_counts()
+     else:
+         analytics['questions_by_exam'] = pd.Series(dtype='int64')
+
+     if 'difficulty_level' in df.columns:
+         analytics['questions_by_difficulty'] = df['difficulty_level'].value_counts()
+     else:
+         analytics['questions_by_difficulty'] = pd.Series(dtype='int64')
+
+     if 'domain' in df.columns:
+         analytics['questions_by_domain'] = df['domain'].value_counts()
+     else:
+         analytics['questions_by_domain'] = pd.Series(dtype='int64')
+
+     # Include exam_type in the domain/subdomain grouping
+     if all(col in df.columns for col in ['exam_type', 'domain', 'subdomain']):
+         analytics['questions_by_subdomain'] = df.groupby(['exam_type', 'domain', 'subdomain']).size().reset_index(name='count')
+     else:
+         analytics['questions_by_subdomain'] = pd.DataFrame(columns=['exam_type', 'domain', 'subdomain', 'count'])
+
+     # Time-based analytics
+     if 'created_at' in df.columns:
+         df['created_at'] = pd.to_datetime(df['created_at'])
+         analytics['questions_by_date'] = df.resample('D', on='created_at').size()
+         analytics['questions_by_month'] = df.resample('M', on='created_at').size()
+         analytics['recent_activity'] = df.sort_values('created_at', ascending=False).head(10)
+
+     # Content coverage analysis
+     if 'reading_passage' in df.columns:
+         analytics['has_passage'] = df['reading_passage'].notna().sum()
+         analytics['passage_ratio'] = (df['reading_passage'].notna().sum() / len(df)) * 100 if len(df) > 0 else 0
+
+         # Calculate average passage length
+         df['passage_length'] = df['reading_passage'].str.len().fillna(0)
+         analytics['avg_passage_length'] = df['passage_length'].mean()
+         analytics['passage_length_dist'] = df['passage_length'].describe()
+
+     # Question quality metrics
+     if 'explanation' in df.columns:
+         analytics['has_explanation'] = df['explanation'].notna().sum()
+         analytics['explanation_ratio'] = (df['explanation'].notna().sum() / len(df)) * 100 if len(df) > 0 else 0
+
+         # Calculate explanation comprehensiveness
+         df['explanation_length'] = df['explanation'].str.len().fillna(0)
+         analytics['avg_explanation_length'] = df['explanation_length'].mean()
+         analytics['explanation_length_dist'] = df['explanation_length'].describe()
+
+     # Options analysis
+     option_cols = ['option_a', 'option_b', 'option_c', 'option_d']
+     if all(col in df.columns for col in option_cols):
+         df['options_count'] = df[option_cols].notna().sum(axis=1)
+         analytics['complete_options'] = (df['options_count'] == 4).sum()
+         analytics['options_ratio'] = (analytics['complete_options'] / len(df)) * 100 if len(df) > 0 else 0
+
+     # Domain coverage analysis
+     if all(col in df.columns for col in ['exam_type', 'domain', 'subdomain']):
+         domain_coverage = df.groupby(['exam_type', 'domain'])['subdomain'].nunique().reset_index()
+         domain_coverage.columns = ['exam_type', 'domain', 'unique_subdomains']
+         analytics['domain_coverage'] = domain_coverage
+
+         # Calculate domain balance score (0-100) per exam type
+         domain_balance_scores = []
+         for exam_type in df['exam_type'].unique():
+             exam_domain_counts = df[df['exam_type'] == exam_type]['domain'].value_counts()
+             if not exam_domain_counts.empty:
+                 max_count = exam_domain_counts.max()
+                 min_count = exam_domain_counts.min()
+                 score = ((1 - (max_count - min_count) / max_count) * 100) if max_count > 0 else 100
+                 domain_balance_scores.append({'exam_type': exam_type, 'balance_score': score})
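+                 # e.g. domain counts {"Reading": 30, "Writing": 10} give
+                 # (1 - (30 - 10) / 30) * 100 ≈ 33.3; perfectly even counts give 100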
+
+         analytics['domain_balance_by_exam'] = pd.DataFrame(domain_balance_scores)
+         analytics['domain_balance_score'] = analytics['domain_balance_by_exam']['balance_score'].mean()
+
+     return analytics
+
+ def rewrite_question(question: Dict[str, Any]) -> Dict[str, Any]:
+     """
+     Use the LLM to rewrite the question, passage, and options while maintaining the same concept.
+     """
+     prompt = f"""Rewrite the following exam question with a new passage and options. Keep the same concept, difficulty level, and correct answer position, but create fresh content.
+
+ Current Question:
+ Reading Passage: {question.get('reading_passage', '')}
+ Question: {question.get('question_text', '')}
+ Options:
+ A) {question.get('option_a', '')}
+ B) {question.get('option_b', '')}
+ C) {question.get('option_c', '')}
+ D) {question.get('option_d', '')}
+ Correct Answer: {question.get('correct_answer', '')}
+ Explanation: {question.get('explanation', '')}
+
+ IMPORTANT LENGTH REQUIREMENTS:
+ - Reading passage must be AT LEAST 100 characters (preferably 200-300)
+ - Question text must be AT LEAST 50 characters
+ - Options can be concise but clear (no minimum length)
+ - Explanation must be AT LEAST 50 characters
+
+ Requirements:
+ 1. Create a new reading passage that:
+    - Must be AT LEAST 100 characters (preferably 200-300)
+    - Covers the same concepts in detail
+    - Maintains similar complexity
+    - Uses rich context and examples
+
+ 2. Write a detailed question that:
+    - Must be AT LEAST 50 characters
+    - Clearly states what is being asked
+    - Includes necessary context
+
+ 3. Create clear options that:
+    - Are concise but clear
+    - Are distinct from each other
+    - Follow a similar format
+    - Maintain the correct answer in the same position
+
+ 4. Write a good explanation that:
+    - Must be AT LEAST 50 characters
+    - Explains the correct answer
+    - Provides clear reasoning
+    - References the passage when relevant
+
+ Return ONLY a JSON object with the following structure:
+ {{
+     "reading_passage": "new_passage (MINIMUM 100 characters)",
+     "question_text": "new_question (MINIMUM 50 characters)",
+     "option_a": "new_option_a (concise)",
+     "option_b": "new_option_b (concise)",
+     "option_c": "new_option_c (concise)",
+     "option_d": "new_option_d (concise)",
+     "explanation": "new_explanation (MINIMUM 50 characters)"
+ }}"""
+
+     try:
+         response = client.chat.completions.create(
+             model=DEPLOYMENT_NAME,
+             messages=[
+                 {
+                     "role": "system",
+                     "content": "You are an expert at rewriting exam questions. Create a detailed reading passage (100+ chars) and clear question (50+ chars). Options should be concise but clear. Explanation should be thorough (50+ chars)."
+                 },
+                 {"role": "user", "content": prompt}
+             ],
+             response_format={"type": "json_object"},
+             temperature=0.7,
+         )
+
+         # Parse the response
+         new_content = json.loads(response.choices[0].message.content)
+
+         # Validate minimum length requirements with detailed error messages
+         length_requirements = {
+             'reading_passage': 100,
+             'question_text': 50,
+             'explanation': 50
+         }
+
+         errors = []
+         for key, min_length in length_requirements.items():
+             value = new_content.get(key, '')
+             current_length = len(value)
+             if current_length < min_length:
+                 errors.append(f"{key} is too short: {current_length} chars (minimum {min_length} required)")
+
+         if errors:
+             error_message = "\n".join(errors)
+             raise ValueError(f"Content length requirements not met:\n{error_message}")
+
+         # Update the question with new content while preserving other fields
+         updated_question = question.copy()
+         updated_question.update(new_content)
+
+         return updated_question
+
+     except json.JSONDecodeError as je:
+         error_msg = f"Invalid JSON response from LLM: {str(je)}"
+         logging.error(error_msg)
+         raise ValueError(error_msg)
+     except Exception as e:
+         logging.error(f"Error rewriting question: {str(e)}")
+         raise e
+
+ def display_question(question, index):
+     """Display a single question with its details."""
+     with st.expander(f"Question {index + 1}", expanded=index == 0):
+         # Add delete and rewrite buttons in the top right corner
+         col1, col2, col3 = st.columns([5, 1, 1])
+         with col2:
+             if st.button("🔄 Rewrite", key=f"rewrite_{question['id']}", type="primary"):
+                 try:
+                     with st.spinner("Rewriting question..."):
+                         # Rewrite the question
+                         updated_question = rewrite_question(question)
+                         # Update in Supabase
+                         supabase.table("exam_contents").update(updated_question).eq("id", question['id']).execute()
+                         st.success("Question rewritten successfully!")
+                         # Refresh the page
+                         st.rerun()
+                 except Exception as e:
+                     st.error(f"Error rewriting question: {str(e)}")
+
+         with col3:
+             if st.button("🗑️ Delete", key=f"delete_{question['id']}", type="secondary"):
+                 try:
+                     # Delete from Supabase
+                     supabase.table("exam_contents").delete().eq("id", question['id']).execute()
+                     st.success("Question deleted successfully!")
+                     # Rerun to refresh the page
+                     st.rerun()
+                 except Exception as e:
+                     st.error(f"Error deleting question: {str(e)}")
+
+         # Metadata
+         with col1:
+             col_a, col_b, col_c, col_d = st.columns(4)
+             with col_a:
+                 st.markdown(f"**Domain:** {question.get('domain', 'N/A')}")
+             with col_b:
+                 st.markdown(f"**Subdomain:** {question.get('subdomain', 'N/A')}")
+             with col_c:
+                 st.markdown(f"**Topic:** {question.get('topic', 'N/A')}")
+             with col_d:
+                 st.markdown(f"**Difficulty:** {question.get('difficulty_level', 'N/A')}")
+
+         # Reading passage if available
+         if question.get('reading_passage'):
+             st.markdown("### 📖 Reading Passage")
+             st.markdown(
+                 f"""<div style='background-color: #f0f2f6; padding: 20px; border-radius: 10px; margin: 10px 0; color: #1f1f1f;'>
+                 {question['reading_passage']}
+                 </div>""",
+                 unsafe_allow_html=True
+             )
+
+         # Question text and options
+         st.markdown("### ❓ Question")
+         st.markdown(f"{question.get('question_text', '')}")
+
+         if any(question.get(f'option_{opt}') for opt in ['a', 'b', 'c', 'd']):
+             st.markdown("### Options")
+             options_container = st.container()
+             with options_container:
+                 for opt in ['a', 'b', 'c', 'd']:
+                     if question.get(f'option_{opt}'):
+                         st.markdown(f"**{opt.upper()}.** {question[f'option_{opt}']}")
+
+         # Answer and explanation
+         st.markdown("### Answer & Explanation")
+         col1, col2 = st.columns(2)
+         with col1:
+             st.markdown(
+                 f"""<div style='background-color: #e8f4ea; padding: 10px; border-radius: 5px; margin: 10px 0; color: #1f1f1f;'>
+                 <strong>Correct Answer:</strong> {question.get('correct_answer', 'N/A')}
+                 </div>""",
+                 unsafe_allow_html=True
+             )
+         with col2:
+             if question.get('explanation'):
+                 st.markdown(
+                     f"""<div style='background-color: #fff3e0; padding: 10px; border-radius: 5px; color: #1f1f1f;'>
+                     <strong>Explanation:</strong><br>{question['explanation']}
+                     </div>""",
+                     unsafe_allow_html=True
+                 )
+
+ def display_analytics(analytics):
+     """Display analytics visualizations."""
+     st.markdown("""
+     <h2 style='text-align: center; margin-bottom: 40px;'>📊 Analytics Dashboard</h2>
+     """, unsafe_allow_html=True)
+
+     # Add Fix Button
+     fix_col1, fix_col2 = st.columns([1, 4])
+     with fix_col1:
+         if st.button("🔧 Fix Questions", type="primary"):
+             with st.spinner("Running fix.py..."):
+                 result = subprocess.run(['python', 'fix.py'], capture_output=True, text=True)
+                 if result.returncode == 0:
+                     st.success("Fix process completed successfully!")
+                 else:
+                     st.error(f"Error running fix.py: {result.stderr}")
+
+     with fix_col2:
+         if analytics.get('unfixed_questions', 0) > 0:
+             st.warning(f"🔍 {analytics['unfixed_questions']} questions need fixing")
+         else:
+             st.success("✅ All questions are fixed")
+
+     # Key Metrics Overview
+     st.markdown("""
+     <div style='text-align: center; margin-bottom: 30px;'>
+         <h3 style='color: #0f4c81;'>Key Metrics</h3>
+     </div>
+     """, unsafe_allow_html=True)
+
+     metrics_container = st.container()
+     with metrics_container:
+         col1, col2, col3, col4 = st.columns(4)
+         with col1:
+             st.metric("📚 Total Questions", analytics['total_questions'])
+         with col2:
+             num_domains = len(analytics['questions_by_domain']) if not analytics['questions_by_domain'].empty else 0
+             st.metric("🎯 Number of Domains", num_domains)
+         with col3:
+             if 'has_passage' in analytics:
+                 passage_ratio = f"{analytics['passage_ratio']:.1f}%"
+                 st.metric("📖 Questions with Passages", passage_ratio)
+         with col4:
+             if 'domain_balance_score' in analytics:
+                 balance_score = f"{analytics['domain_balance_score']:.1f}%"
+                 st.metric("⚖️ Domain Balance Score", balance_score)
+
+     # Content Quality Metrics
+     if any(key in analytics for key in ['has_explanation', 'complete_options', 'avg_passage_length']):
+         st.markdown("""
+         <div style='text-align: center; margin: 30px 0;'>
+             <h3 style='color: #0f4c81;'>Content Quality Metrics</h3>
+         </div>
+         """, unsafe_allow_html=True)
+
+         quality_cols = st.columns(3)
+         with quality_cols[0]:
+             if 'explanation_ratio' in analytics:
+                 st.metric("📝 Questions with Explanations",
+                           f"{analytics['explanation_ratio']:.1f}%",
+                           help="Percentage of questions that have explanations")
+         with quality_cols[1]:
+             if 'options_ratio' in analytics:
+                 st.metric("✅ Complete Option Sets",
+                           f"{analytics['options_ratio']:.1f}%",
+                           help="Percentage of questions with all 4 options")
+         with quality_cols[2]:
+             if 'avg_passage_length' in analytics:
+                 st.metric("📊 Avg Passage Length",
+                           f"{int(analytics['avg_passage_length'])} chars",
+                           help="Average length of reading passages")
+
+     # Time-based Analytics
+     if 'questions_by_date' in analytics and not analytics['questions_by_date'].empty:
+         st.markdown("""
+         <div style='text-align: center; margin: 30px 0;'>
+             <h3 style='color: #0f4c81;'>Question Generation Timeline</h3>
+         </div>
+         """, unsafe_allow_html=True)
+
+         # Daily question generation trend
+         fig_timeline = px.line(
+             x=analytics['questions_by_date'].index,
+             y=analytics['questions_by_date'].values,
+             title="Daily Question Generation",
+             labels={'x': 'Date', 'y': 'Number of Questions'}
+         )
+         fig_timeline.update_layout(showlegend=False)
+         st.plotly_chart(fig_timeline, use_container_width=True)
+
+         # Monthly aggregation
+         if 'questions_by_month' in analytics and not analytics['questions_by_month'].empty:
+             fig_monthly = px.bar(
+                 x=analytics['questions_by_month'].index,
+                 y=analytics['questions_by_month'].values,
+                 title="Monthly Question Generation",
+                 labels={'x': 'Month', 'y': 'Number of Questions'}
+             )
+             fig_monthly.update_layout(showlegend=False)
+             st.plotly_chart(fig_monthly, use_container_width=True)
+
+     # Questions by Exam Type
+     if not analytics['questions_by_exam'].empty:
+         st.markdown("""
+         <div style='text-align: center; margin: 30px 0;'>
+             <h3 style='color: #0f4c81;'>Distribution by Exam Type</h3>
+         </div>
+         """, unsafe_allow_html=True)
+
+         col1, col2, col3 = st.columns([1, 3, 1])
+         with col2:
+             fig = px.pie(
+                 values=analytics['questions_by_exam'].values,
+                 names=analytics['questions_by_exam'].index,
+                 hole=0.4,
+                 color_discrete_sequence=px.colors.qualitative.Set3
+             )
+             fig.update_layout(
+                 showlegend=True,
+                 legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
+                 margin=dict(t=60, b=40, l=40, r=40)
+             )
+             st.plotly_chart(fig, use_container_width=True)
+
+     # Questions by Difficulty
+     if not analytics['questions_by_difficulty'].empty:
+         st.markdown("""
+         <div style='text-align: center; margin: 30px 0;'>
+             <h3 style='color: #0f4c81;'>Distribution by Difficulty Level</h3>
+         </div>
+         """, unsafe_allow_html=True)
+
+         col1, col2, col3 = st.columns([1, 3, 1])
+         with col2:
+             fig = px.bar(
+                 x=analytics['questions_by_difficulty'].index,
+                 y=analytics['questions_by_difficulty'].values,
+                 color=analytics['questions_by_difficulty'].index,
+                 color_discrete_sequence=px.colors.qualitative.Set2
+             )
+             fig.update_layout(
+                 showlegend=False,
+                 xaxis_title="Difficulty Level",
+                 yaxis_title="Number of Questions",
+                 margin=dict(t=40, b=40, l=40, r=40)
+             )
+             st.plotly_chart(fig, use_container_width=True)
+
+     # Domain Coverage Analysis
+     if 'domain_coverage' in analytics and not analytics['domain_coverage'].empty:
+         st.markdown("""
+         <div style='text-align: center; margin: 30px 0;'>
+             <h3 style='color: #0f4c81;'>Domain Coverage Analysis</h3>
+         </div>
+         """, unsafe_allow_html=True)
+
+         # Domain coverage bar chart
+         fig_coverage = px.bar(
+             analytics['domain_coverage'],
+             x='domain',
+             y='unique_subdomains',
+             title="Number of Unique Subdomains per Domain",
+             color='unique_subdomains',
+             color_continuous_scale='Viridis'
+         )
+         fig_coverage.update_layout(
+             xaxis_title="Domain",
+             yaxis_title="Number of Unique Subdomains",
+             showlegend=False
+         )
+         st.plotly_chart(fig_coverage, use_container_width=True)
+
+     # Questions by Domain and Subdomain
+     if not analytics['questions_by_subdomain'].empty and len(analytics['questions_by_subdomain']) > 0:
+         st.markdown("""
+         <div style='text-align: center; margin: 30px 0;'>
+             <h3 style='color: #0f4c81;'>Distribution by Domain and Subdomain</h3>
+         </div>
+         """, unsafe_allow_html=True)
+
+         fig = px.treemap(
+             analytics['questions_by_subdomain'],
+             path=['exam_type', 'domain', 'subdomain'],
+             values='count',
+             color='count',
+             color_continuous_scale='Viridis'
+         )
+         fig.update_layout(margin=dict(t=30, b=30, l=30, r=30))
+         fig.update_traces(textinfo="label+value")
+         st.plotly_chart(fig, use_container_width=True)
+
+     # Recent Activity
+     if 'recent_activity' in analytics and not analytics['recent_activity'].empty:
+         st.markdown("""
+         <div style='text-align: center; margin: 30px 0;'>
+             <h3 style='color: #0f4c81;'>Recent Activity</h3>
+         </div>
+         """, unsafe_allow_html=True)
+
+         recent_df = analytics['recent_activity']
+         st.dataframe(
+             recent_df[['exam_type', 'domain', 'subdomain', 'difficulty_level', 'created_at']],
+             hide_index=True,
+             column_config={
+                 'created_at': 'Timestamp',
+                 'exam_type': 'Exam Type',
+                 'domain': 'Domain',
+                 'subdomain': 'Subdomain',
+                 'difficulty_level': 'Difficulty'
+             }
+         )
+
+     # Add some spacing at the bottom
+     st.markdown("<br><br>", unsafe_allow_html=True)
+
+ def get_unique_domains():
+     """Get unique domains from the database."""
+     domains = {
+         "SAT": ["Mathematics", "Reading and Writing"],
+         "IELTS": ["Reading", "Writing", "Speaking", "Listening"],
+         "TOEFL": ["Reading", "Listening", "Speaking", "Writing"]
+     }
+     return domains
+
+ def get_subdomains_for_domain(exam_type, domain):
+     """Get subdomains for a specific domain."""
+     subdomains = {
+         "SAT": {
+             "Mathematics": [
+                 "Algebra",
+                 "Advanced Mathematics",
+                 "Problem Solving and Data Analysis",
+                 "Geometry and Trigonometry"
+             ],
+             "Reading and Writing": [
+                 "Information and Ideas",
+                 "Craft and Structure"
+             ]
+         },
+         "IELTS": {
+             "Reading": [
+                 "Information Location",
+                 "Critical Analysis",
+                 "Vocabulary and Reference"
+             ],
+             "Writing": [
+                 "Task Analysis",
+                 "Essay Development",
+                 "Language Control"
+             ],
+             "Speaking": [
+                 "Personal Expression",
+                 "Topic Development",
+                 "Communication Skills"
+             ],
+             "Listening": [
+                 "Academic Understanding",
+                 "Pragmatic Understanding",
+                 "Connecting Information"
+             ]
+         },
+         "TOEFL": {
+             "Reading": [
+                 "Comprehension",
+                 "Analysis",
+                 "Academic Skills"
+             ],
+             "Listening": [
+                 "Academic Understanding",
+                 "Pragmatic Understanding",
+                 "Connecting Information"
+             ],
+             "Speaking": [
+                 "Independent Tasks",
+                 "Integrated Tasks",
+                 "Delivery Skills"
+             ],
+             "Writing": [
+                 "Independent Writing",
+                 "Integrated Writing",
+                 "Language Control"
+             ]
+         }
+     }
+     return subdomains.get(exam_type, {}).get(domain, [])
+
+ def get_topics_for_subdomain(exam_type, domain, subdomain):
+     """Get topics for a specific subdomain."""
+     topics = {
+         "SAT": {
+             "Reading and Writing": {
+                 "Information and Ideas": [
+                     "Central Ideas and Details",
+                     "Command of Textual Evidence",
+                     "Command of Quantitative Evidence",
+                     "Inferences",
+                     "Words in Context"
+                 ],
+                 "Craft and Structure": [
+                     "Text Structure and Purpose",
+                     "Cross-Text Connections",
+                     "Rhetorical Synthesis",
+                     "Boundaries",
+                     "Transitions"
+                 ]
+             },
+             "Mathematics": {
+                 "Algebra": [
+                     "Linear equations in one variable",
+                     "Linear equations in two variables",
+                     "Linear functions",
+                     "Systems of two linear equations in two variables",
+                     "Linear inequalities in one or two variables"
+                 ],
+                 "Advanced Mathematics": [
+                     "Equivalent expressions",
+                     "Nonlinear equations in one variable and systems of equations in two variables",
+                     "Nonlinear functions"
+                 ],
+                 "Problem Solving and Data Analysis": [
+                     "Ratios, rates, proportional relationships, and units",
+                     "Percentages",
+                     "One-variable data: distributions and measures of center and spread",
+                     "Two-variable data: models and scatterplots",
+                     "Probability and conditional probability",
+                     "Inference from sample statistics and margin of error",
+                     "Evaluating statistical claims: observational studies and experiments"
+                 ],
+                 "Geometry and Trigonometry": [
+                     "Area and volume",
+                     "Lines, angles, and triangles",
+                     "Right triangles and trigonometry",
+                     "Circles"
+                 ]
+             }
+         }
+         # Add IELTS and TOEFL topics here if needed
+     }
+     return topics.get(exam_type, {}).get(domain, {}).get(subdomain, [])
1377
+
+ # Streamlit Interface
+ st.set_page_config(page_title="📄 PDF to Exam Questions Generator with Supabase Upload", layout="wide")
+ st.title("📄 PDF to Exam Questions Generator with Supabase Upload")
+
+ # Create tabs for different functionalities
+ tab_upload, tab_view, tab_analytics = st.tabs(["📤 Upload & Generate", "🔍 View Questions", "📊 Analytics"])
+
+ with tab_upload:
+     st.markdown(
+         """
+         Upload PDF files containing exam material, select the exam type, and generate structured questions automatically.
+         The generated questions will be uploaded to your Supabase database.
+
+         **Supported Exam Types**: SAT, IELTS, TOEFL
+         """
+     )
+
+     # File uploader and exam type selection
+     uploaded_files = st.file_uploader("📥 Upload PDFs", type=["pdf"], accept_multiple_files=True)
+     exam_type = st.selectbox(
+         "📝 Select Exam Type",
+         options=["SAT", "IELTS", "TOEFL"],
+         index=0
+     )
+
+     # Generate and Upload Button
+     if st.button("🚀 Generate and Upload Questions"):
+         if not uploaded_files:
+             st.error("Please upload at least one PDF file.")
+         else:
+             with st.spinner("Processing files..."):
+                 questions_json, download_content = process_pdfs(uploaded_files, exam_type)
+                 if questions_json:
+                     st.success(f"Successfully processed {len(uploaded_files)} files and generated questions!")
+                     st.json(json.loads(questions_json))
+
+                     # Provide download button
+                     st.download_button(
+                         label="⬇️ Download Questions JSON",
+                         data=download_content,
+                         file_name=f"generated_questions_{uuid.uuid4()}.json",
+                         mime="application/json"
+                     )
+
+ with tab_view:
+     st.subheader("Question Browser")
+
+     # Initialize session state
+     if 'selected_domain' not in st.session_state:
+         st.session_state.selected_domain = "All"
+     if 'selected_subdomain' not in st.session_state:
+         st.session_state.selected_subdomain = "All"
+     if 'selected_topic' not in st.session_state:
+         st.session_state.selected_topic = "All"
+
+     # Filters
+     col1, col2 = st.columns(2)
+     with col1:
+         view_exam_type = st.selectbox("Exam Type", ["All"] + EXAM_TYPES, key="view_exam_type")
+
+         # Get domains based on exam type
+         domains = ["All"]
+         if view_exam_type != "All":
+             domains.extend(get_unique_domains().get(view_exam_type, []))
+         domain = st.selectbox("Domain", domains, key="domain_select")
+
+         # Reset dependent subdomain and topic selections when the domain changes
+         if domain != st.session_state.get('last_domain'):
+             st.session_state.selected_subdomain = "All"
+             st.session_state.last_domain = domain
+             st.session_state.selected_topic = "All"
+
+     with col2:
+         difficulty = st.selectbox("Difficulty Level", ["All"] + DIFFICULTY_LEVELS)
+
+         # Get subdomains based on selected exam type and domain
+         subdomains = ["All"]
+         if domain != "All" and view_exam_type != "All":
+             subdomains.extend(get_subdomains_for_domain(view_exam_type, domain))
+         subdomain = st.selectbox("Subdomain", subdomains, key="subdomain_select")
+
+         # Get topics based on selected exam type, domain, and subdomain
+         topics = ["All"]
+         if subdomain != "All" and domain != "All" and view_exam_type != "All":
+             topics.extend(get_topics_for_subdomain(view_exam_type, domain, subdomain))
+         topic = st.selectbox("Topic", topics, key="topic_select")
+
+     # Apply filters
+     filters = {
+         'exam_type': view_exam_type if view_exam_type != "All" else None,
+         'difficulty_level': difficulty if difficulty != "All" else None,
+         'domain': domain if domain != "All" else None,
+         'subdomain': subdomain if subdomain != "All" else None,
+         'topic': topic if topic != "All" else None
+     }
+
+     # Remove None values from filters
+     filters = {k: v for k, v in filters.items() if v is not None}
+
+     # Get filtered questions
+     questions = get_questions(filters)
+
+     if not questions:
+         st.info("No questions found matching the selected filters.")
+     else:
+         st.success(f"Found {len(questions)} questions")
+
+         # Display questions
+         for i, question in enumerate(questions):
+             display_question(question, i)
+
+ with tab_analytics:
+     # Get all questions for analytics
+     all_questions = get_questions()
+     analytics = get_analytics_data(all_questions)
+     display_analytics(analytics)
+
+ st.markdown(
+     """
+     ---
+     **Note**: This application uses Azure OpenAI services to generate exam questions and uploads them to Supabase. Ensure that your API credentials are correctly set in the environment variables.
+     """
+ )
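
The note above leaves "the environment variables" unspecified. Going by the `os.getenv` calls in `fix.py` below, a minimal `.env` would cover these keys (the values shown are placeholders, not real credentials; `AZURE_OPENAI_API_VERSION` is optional and defaults to `2023-05-15`):

```
SUPABASE_DB_URL=https://your-project.supabase.co
SUPABASE_API_KEY=your-supabase-api-key
AZURE_OPENAI_KEY=your-azure-openai-key
AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
AZURE_OPENAI_DEPLOYMENT_NAME=your-deployment-name
AZURE_OPENAI_API_VERSION=2023-05-15
```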
fix.py ADDED
@@ -0,0 +1,349 @@
+ # fix.py
+
+ import os
+ import json
+ import logging
+ import re
+ from typing import Dict, Any, Optional
+ from io import BytesIO
+ import concurrent.futures
+ from threading import Lock
+ import queue
+
+ import openai
+ from supabase import create_client, Client
+ from dotenv import load_dotenv
+ from tqdm import tqdm  # For progress bar
+ from openai import AzureOpenAI
+
+ # Set up logging with thread safety
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(threadName)s - %(levelname)s - %(message)s',
+     handlers=[
+         logging.FileHandler('fix.log'),
+         logging.StreamHandler()
+     ]
+ )
+
+ # Load environment variables from .env file (if present)
+ load_dotenv()
+
+ # Constants
+ MIN_PASSAGE_WORDS = 100  # Minimum number of words for reading_passage
+ VALID_CORRECT_ANSWERS = {'A', 'B', 'C', 'D'}
+ EXAM_TYPES = ["SAT", "IELTS", "TOEFL"]
+
+ # Load environment variables
+ SUPABASE_URL = os.getenv("SUPABASE_DB_URL")
+ SUPABASE_API_KEY = os.getenv("SUPABASE_API_KEY")
+ AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
+ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
+ AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
+ AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2023-05-15")
+
+ # Validate environment variables
+ missing_vars = []
+ if not SUPABASE_URL:
+     missing_vars.append("SUPABASE_DB_URL")
+ if not SUPABASE_API_KEY:
+     missing_vars.append("SUPABASE_API_KEY")
+ if not AZURE_OPENAI_KEY:
+     missing_vars.append("AZURE_OPENAI_KEY")
+ if not AZURE_OPENAI_ENDPOINT:
+     missing_vars.append("AZURE_OPENAI_ENDPOINT")
+ if not AZURE_OPENAI_DEPLOYMENT_NAME:
+     missing_vars.append("AZURE_OPENAI_DEPLOYMENT_NAME")
+
+ if missing_vars:
+     logging.error(f"Missing environment variables: {', '.join(missing_vars)}")
+     raise EnvironmentError(f"Missing environment variables: {', '.join(missing_vars)}")
+
+ # Initialize Supabase client
+ supabase: Client = create_client(SUPABASE_URL, SUPABASE_API_KEY)
+ logging.info("Connected to Supabase successfully.")
+
+ # Initialize OpenAI for Azure
+ openai.api_type = "azure"
+ openai.api_key = AZURE_OPENAI_KEY
+ openai.api_base = AZURE_OPENAI_ENDPOINT
+ openai.api_version = AZURE_OPENAI_API_VERSION
+
+ # Set up Azure OpenAI client
+ API_KEY = os.getenv("AZURE_OPENAI_KEY")
+ ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
+ DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
+
+ if not API_KEY or not ENDPOINT or not DEPLOYMENT_NAME:
+     raise ValueError("Azure OpenAI configuration is incomplete.")
+
+ client = AzureOpenAI(
+     api_key=API_KEY,
+     api_version="2024-02-15-preview",
+     azure_endpoint=ENDPOINT
+ )
+
+ # Thread-safe counter for progress tracking
+ class AtomicCounter:
+     def __init__(self, initial=0):
+         self._value = initial
+         self._lock = Lock()
+
+     def increment(self):
+         with self._lock:
+             self._value += 1
+             return self._value
+
+     def value(self):
+         with self._lock:
+             return self._value
+
+ def word_count(text: str) -> int:
+     """Returns the number of words in a given text."""
+     return len(text.split())
+
+ def is_valid_correct_answer(answer: str) -> bool:
+     """Checks if the correct_answer is one of A, B, C, D."""
+     return answer.upper() in VALID_CORRECT_ANSWERS
+
+ def clean_text(text: str) -> str:
+     """Cleans the text by removing unwanted characters and extra spaces."""
+     text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
+     text = text.strip()
+     return text
+
+ def check_row_quality(row: Dict[str, Any]) -> bool:
+     """
+     Checks if the row has good quality data according to exam standards.
+     Returns True if the row is good, False if it needs fixing.
+     """
+     # Skip if already fixed
+     if row.get('is_fixed'):
+         return True
+
+     required_fields = [
+         'exam_type', 'content_type', 'exam_section', 'domain', 'subdomain',
+         'topic', 'difficulty_level', 'reading_passage', 'question_text',
+         'option_a', 'option_b', 'option_c', 'option_d', 'correct_answer',
+         'explanation'
+     ]
+
+     # Check for missing or empty required fields
+     for field in required_fields:
+         if not row.get(field):
+             return False
+
+     # Check for OCR artifacts in text fields
+     text_fields = ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']
+     for field in text_fields:
+         text = row.get(field, '')
+         if isinstance(text, str):
+             if 'arebasedonthe' in text or text.count('.') > 20 or 'Line' in text:
+                 return False
+
+     return True
+
+ def generate_fixed_content(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+     """
+     Uses Azure OpenAI to generate fixed content for a row.
+     Returns a dictionary with fixed fields or None if failed.
+     """
+     prompt = f"""Fix and improve the following exam question. Clean up any OCR artifacts, fix formatting issues, and ensure high quality.
+
+ Current Question:
+ Reading Passage: {row.get('reading_passage', '')}
+ Question: {row.get('question_text', '')}
+ Options:
+ A) {row.get('option_a', '')}
+ B) {row.get('option_b', '')}
+ C) {row.get('option_c', '')}
+ D) {row.get('option_d', '')}
+ Correct Answer: {row.get('correct_answer', '')}
+ Explanation: {row.get('explanation', '')}
+
+ Requirements:
+ 1. Clean up any OCR artifacts and formatting issues
+ 2. Maintain the same meaning and difficulty level
+ 3. Keep the same correct answer
+ 4. Ensure the explanation clearly justifies the answer
+ 5. Make sure all text is properly formatted and readable
+ 6. Preserve all important content and details
+ 7. Fix any spacing or punctuation issues
+
+ Return a JSON object with the following fields:
+ {{
+     "reading_passage": "cleaned passage",
+     "question_text": "cleaned question",
+     "option_a": "cleaned option A",
+     "option_b": "cleaned option B",
+     "option_c": "cleaned option C",
+     "option_d": "cleaned option D",
+     "explanation": "cleaned explanation"
+ }}"""
+
+     try:
+         response = client.chat.completions.create(
+             model=DEPLOYMENT_NAME,
+             messages=[
+                 {
+                     "role": "system",
+                     "content": "You are an expert at fixing and improving exam questions. Clean up formatting while preserving meaning."
+                 },
+                 {"role": "user", "content": prompt}
+             ],
+             response_format={"type": "json_object"},
+             temperature=0.0
+         )
+
+         fixed_content = json.loads(response.choices[0].message.content)
+
+         # Preserve original fields and update only the fixed ones
+         updated_data = row.copy()
+         updated_data.update(fixed_content)
+         updated_data['is_fixed'] = True
+
+         return updated_data
+
+     except Exception as e:
+         logging.error(f"Error generating fixed content: {str(e)}")
+         return None
+
+ def extract_json(text: str) -> Optional[str]:
+     """
+     Extracts JSON object from a block of text.
+     Returns the JSON string or None if not found.
+     """
+     try:
+         # Find the first { and the last }
+         start = text.find('{')
+         end = text.rfind('}')
+         if start == -1 or end == -1:
+             return None
+         json_str = text[start:end+1]
+         # Validate JSON
+         json.loads(json_str)
+         return json_str
+     except json.JSONDecodeError:
+         return None
+
+ def update_row_in_supabase(row_id: str, fixed_data: Dict[str, Any]) -> bool:
+     """
+     Updates a row in Supabase with fixed data.
+     Returns True if successful, False otherwise.
+     """
+     try:
+         response = supabase.table("exam_contents").update(fixed_data).eq("id", row_id).execute()
+
+         # Check if data exists in the response
+         if response.data:
+             logging.info(f"Successfully updated row ID {row_id}.")
+             return True
+         else:
+             logging.error(f"Failed to update row ID {row_id}.")
+             return False
+
+     except Exception as e:
+         logging.error(f"Exception while updating row ID {row_id}: {str(e)}")
+         return False
+
+ def process_row(row: Dict[str, Any], progress_counter: AtomicCounter, total_rows: int) -> Dict[str, Any]:
+     """
+     Process a single row with progress tracking.
+     Returns a dictionary with the results.
+     """
+     row_id = row.get('id')
+     result = {
+         'row_id': row_id,
+         'success': False,
+         'message': ''
+     }
+
+     try:
+         if not row_id:
+             result['message'] = "Row without ID found"
+             return result
+
+         if check_row_quality(row):
+             success = update_row_in_supabase(row_id, {'is_fixed': True})
+             result['success'] = success
+             result['message'] = "Good quality, marked as fixed"
+             progress_counter.increment()
+             return result
+
+         fixed_data = generate_fixed_content(row)
+         if not fixed_data:
+             result['message'] = "Failed to fix content"
+             progress_counter.increment()
+             return result
+
+         success = update_row_in_supabase(row_id, fixed_data)
+         result['success'] = success
+         result['message'] = "Successfully fixed and updated" if success else "Failed to update"
+
+     except Exception as e:
+         result['message'] = f"Error: {str(e)}"
+         logging.error(f"Error processing row {row_id}: {str(e)}")
+
+     progress_counter.increment()
+     progress = progress_counter.value()
+     if progress % 10 == 0:  # Update progress every 10 rows
+         print(f"Progress: {progress}/{total_rows} rows processed")
+
+     return result
+
+ def main():
+     """
+     Main function to process and fix exam questions in Supabase using multithreading.
+     """
+     logging.info("Starting fix.py script with multithreading.")
+
+     try:
+         # Fetch only unfixed rows from exam_contents
+         response = supabase.table("exam_contents").select("*").eq("is_fixed", False).execute()
+         rows = response.data
+         total_rows = len(rows)
+         logging.info(f"Fetched {total_rows} unfixed rows from exam_contents.")
+
+         if total_rows == 0:
+             logging.info("No unfixed rows found in exam_contents. Exiting.")
+             print("No unfixed rows found in exam_contents. Exiting.")
+             return
+
+         # Initialize counters
+         progress_counter = AtomicCounter()
+         success_count = 0
+         failure_count = 0
+
+         # Create a thread pool
+         max_workers = min(32, total_rows)  # Cap at 32 threads or total rows, whichever is smaller
+         print(f"Starting processing with {max_workers} threads...")
+
+         with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+             # Submit all rows for processing
+             future_to_row = {
+                 executor.submit(process_row, row, progress_counter, total_rows): row
+                 for row in rows
+             }
+
+             # Process completed futures as they finish
+             for future in concurrent.futures.as_completed(future_to_row):
+                 result = future.result()
+                 if result['success']:
+                     success_count += 1
+                 else:
+                     failure_count += 1
+                     logging.warning(f"Failed to process row {result['row_id']}: {result['message']}")
+
+         # Final statistics
+         logging.info(f"Processing completed. Success: {success_count}, Failures: {failure_count}")
+         print(f"\nProcessing completed:")
+         print(f"Total rows processed: {total_rows}")
+         print(f"Successful updates: {success_count}")
+         print(f"Failed updates: {failure_count}")
+
+     except Exception as e:
+         logging.error(f"An unexpected error occurred: {str(e)}")
+         print(f"An unexpected error occurred: {str(e)}")
+
+ if __name__ == "__main__":
+     main()
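
With the `.env` sketched earlier in place, the script runs directly as `python fix.py` thanks to the `__main__` guard. Its quality gate, `check_row_quality`, hinges on three blunt OCR heuristics; a standalone sketch of that test (same three conditions, with sample strings invented for illustration) shows what gets sent to the model for rewriting:

```python
# Standalone sketch of the OCR-artifact heuristic in fix.py's check_row_quality.
# The sample strings below are made up for illustration.
def looks_garbled(text: str) -> bool:
    # Flag fused words, an implausible number of periods, or leftover
    # "Line ..." references from scanned passages.
    return 'arebasedonthe' in text or text.count('.') > 20 or 'Line' in text

print(looks_garbled("Questions 1-4 arebasedonthe following passage."))  # True
print(looks_garbled("A clean, well-formed passage about glaciers."))    # False
print(looks_garbled("Line 12: the author shifts tone."))                # True
```

Note that the `'Line'` substring test is deliberately coarse: a passage that legitimately mentions "Line" is also flagged, trading a few unnecessary API calls for fewer missed artifacts.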
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ streamlit
+ openai
+ python-dotenv
+ pydantic
+ supabase
+ PyMuPDF
+ plotly
+ pandas
+ tiktoken