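"""Streamlit app: generate exam questions (SAT, IELTS, TOEFL) from uploaded PDFs.

Overview of the pipeline implemented below:
  1. Extract text from each uploaded PDF (PyPDF2).
  2. Split the text into fixed-size chunks.
  3. Ask an OpenAI model to extract/generate questions as JSON.
  4. Validate the questions and upload them to the Supabase "exam_contents" table.
  5. Browse, rewrite, delete, and analyse stored questions from the UI tabs.
"""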
import json
import logging
import os
import re
import subprocess
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO
from queue import Queue
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
import plotly.express as px
import PyPDF2
import streamlit as st
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel
from supabase import Client, create_client
# Set page config - MUST be the first Streamlit command
st.set_page_config(page_title="PDF to Exam Questions Generator with Supabase Upload", layout="wide")

# Load environment variables from .env file (if present)
load_dotenv()

# Check for required environment variables
required_env_vars = {
    "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
    "SUPABASE_DB_URL": os.getenv("SUPABASE_DB_URL"),
    "SUPABASE_API_KEY": os.getenv("SUPABASE_API_KEY")
}
missing_vars = [var for var, value in required_env_vars.items() if not value]
if missing_vars:
    st.error(f"Missing required environment variables: {', '.join(missing_vars)}")
    st.stop()
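# Illustrative .env layout matching the checks above (placeholder values only;
# the exact key formats depend on your OpenAI and Supabase projects):
#
#   OPENAI_API_KEY=sk-...
#   SUPABASE_DB_URL=https://<project-ref>.supabase.co
#   SUPABASE_API_KEY=<anon-or-service-role-key>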
# Set up logging
class StringListHandler(logging.Handler):
    def __init__(self):
        super().__init__()
        self.logs = []

    def emit(self, record):
        self.logs.append(self.format(record))

    def get_logs(self):
        return "\n".join(self.logs)

    def clear(self):
        self.logs = []

# Set up logging with our custom handler
log_handler = StringListHandler()
log_handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
logging.getLogger().addHandler(log_handler)
logging.getLogger().setLevel(logging.INFO)

# Add a filter to suppress HTTP request logging from Supabase and related libraries.
class HttpRequestFilter(logging.Filter):
    def filter(self, record):
        if "HTTP Request:" in record.getMessage():
            return False
        return True

log_handler.addFilter(HttpRequestFilter())
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
# Constants
EXAM_TYPES = ["SAT", "IELTS", "TOEFL"]
DIFFICULTY_LEVELS = ["Easy", "Medium", "Hard", "Very Hard"]
REQUIRED_FIELDS = [
    "exam_type", "content_type", "exam_section", "domain", "subdomain",
    "topic", "difficulty_level", "reading_passage", "question_text",
    "option_a", "option_b", "option_c", "option_d", "correct_answer",
    "explanation", "is_active", "source_text"
]

class ExamQuestion(BaseModel):
    exam_type: str
    content_type: str = "Generated"
    exam_section: str
    domain: str
    subdomain: str
    topic: str
    difficulty_level: str = "Medium"
    reading_passage: str
    reading_passage_title: Optional[str] = None
    question_text: str
    option_a: str
    option_b: str
    option_c: str
    option_d: str
    correct_answer: str
    explanation: str
    source_text: str  # The original text from which the question was generated
    is_active: bool = True

class ExamQuestionResponse(BaseModel):
    questions: List[ExamQuestion]
# Set up OpenAI client
try:
    client = OpenAI(api_key=required_env_vars["OPENAI_API_KEY"])
    logging.info("OpenAI client initialized successfully.")
except Exception as e:
    logging.error(f"Failed to initialize OpenAI client: {e}")
    st.error(f"Failed to initialize OpenAI client: {str(e)}")

# Set up Supabase client
SUPABASE_URL = required_env_vars["SUPABASE_DB_URL"]
SUPABASE_API_KEY = required_env_vars["SUPABASE_API_KEY"]
supabase: Client = create_client(SUPABASE_URL, SUPABASE_API_KEY)

# Create a thread-safe queue for logging
log_queue = Queue()

def safe_st_warning(message: str):
    """Thread-safe way to queue warning messages"""
    log_queue.put(("warning", message))

def safe_st_error(message: str):
    """Thread-safe way to queue error messages"""
    log_queue.put(("error", message))
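# Worker threads must not call st.* directly (Streamlit elements belong to the main
# script thread), so they queue messages here and the main thread drains the queue,
# as generate_questions() does further below:
#
#     while not log_queue.empty():
#         msg_type, message = log_queue.get()
#         if msg_type == "warning":
#             st.warning(message)
#         elif msg_type == "error":
#             st.error(message)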
# Define the domain structures | |
domain_structures = { | |
"SAT": """SAT Domains and Subdomains: | |
1. Reading and Writing: | |
- Craft and Structure: | |
* Words in Context | |
* Text Structure and Purpose | |
* Cross-Text Connections | |
- Information and Ideas: | |
* Central Ideas and Details | |
* Command of Textual Evidence | |
* Command of Quantitative Evidence | |
* Inferences | |
- Standard English Conventions: | |
* Boundaries | |
* Form, Structure, and Sense | |
- Expression of Ideas: | |
* Transitions | |
* Rhetorical Synthesis | |
2. Mathematics: | |
- Algebra: | |
* Linear equations in one variable | |
* Linear equations in two variables | |
* Linear functions | |
* Systems of two linear equations in two variables | |
* Linear inequalities in one or two variables | |
- Advanced Mathematics: | |
* Equivalent expressions | |
* Nonlinear equations in one variable and systems of equations in two variables | |
* Nonlinear functions | |
- Problem Solving and Data Analysis: | |
* Ratios, rates, proportional relationships, and units | |
* Percentages | |
* One-variable data: distributions and measures of center and spread | |
* Two-variable data: models and scatterplots | |
* Probability and conditional probability | |
* Inference from sample statistics and margin of error | |
* Evaluating statistical claims: observational studies and experiments | |
- Geometry and Trigonometry: | |
* Area and volume | |
* Lines, angles, and triangles | |
* Right triangles and trigonometry | |
* Circles""", | |
"IELTS": """IELTS Domains and Subdomains: | |
1. Reading: | |
- Information Location: | |
* Scanning for Details | |
* Skimming for Main Ideas | |
* Locating Specific Information | |
* Finding Supporting Evidence | |
- Critical Analysis: | |
* Author's Purpose | |
* Text Organization | |
* Opinion and Attitude | |
* Argument Analysis | |
- Vocabulary and Reference: | |
* Word Meaning in Context | |
* Reference Words | |
* Paraphrase Recognition | |
* Academic Vocabulary | |
2. Writing: | |
- Task Analysis: | |
* Data Interpretation | |
* Process Description | |
* Compare and Contrast | |
* Problem and Solution | |
- Essay Development: | |
* Argument Construction | |
* Evidence Support | |
* Coherence and Cohesion | |
* Academic Style | |
- Language Control: | |
* Grammar Range | |
* Vocabulary Usage | |
* Sentence Structure | |
* Punctuation | |
3. Speaking: | |
- Personal Expression: | |
* Self Introduction | |
* Personal Experience | |
* Opinion Expression | |
* Future Plans | |
- Topic Development: | |
* Extended Discourse | |
* Topic Analysis | |
* Example Provision | |
* Abstract Discussion | |
- Communication Skills: | |
* Fluency and Coherence | |
* Pronunciation | |
* Interactive Communication | |
* Response Relevance | |
4. Listening: | |
- Academic Understanding: | |
* Lecture Comprehension | |
* Discussion Analysis | |
* Main Points Identification | |
* Detail Recognition | |
- Pragmatic Understanding: | |
* Speaker Attitude | |
* Function of Utterances | |
* Degree of Certainty | |
* Speaker Relationship | |
- Connecting Information: | |
* Information Organization | |
* Connecting Content | |
* Understanding Examples | |
* Making Inferences""",
"TOEFL": """TOEFL Domains and Subdomains: | |
1. Reading: | |
- Comprehension: | |
* Main Idea and Details | |
* Inference Making | |
* Author's Purpose | |
* Vocabulary in Context | |
- Analysis: | |
* Text Organization | |
* Information Integration | |
* Argument Evaluation | |
* Evidence Assessment | |
- Academic Skills: | |
* Paraphrase Recognition | |
* Summary Skills | |
* Table Completion | |
* Classification | |
2. Listening: | |
- Academic Understanding: | |
* Lecture Comprehension | |
* Discussion Analysis | |
* Main Points Identification | |
* Detail Recognition | |
- Pragmatic Understanding: | |
* Speaker Attitude | |
* Function of Utterances | |
* Degree of Certainty | |
* Speaker Relationship | |
- Connecting Information: | |
* Information Organization | |
* Connecting Content | |
* Understanding Examples | |
* Making Inferences | |
3. Speaking: | |
- Independent Tasks: | |
* Opinion Expression | |
* Personal Experience | |
* Preference Justification | |
* Choice Explanation | |
- Integrated Tasks: | |
* Lecture Summary | |
* Reading-Listening Integration | |
* Campus Situation Response | |
* Academic Topic Discussion | |
- Delivery Skills: | |
* Pronunciation | |
* Intonation | |
* Rhythm and Pacing | |
* Natural Flow | |
4. Writing: | |
- Independent Writing: | |
* Essay Organization | |
* Thesis Development | |
* Evidence Support | |
* Conclusion Writing | |
- Integrated Writing: | |
* Source Integration | |
* Information Synthesis | |
* Accurate Reporting | |
* Response Organization | |
- Language Control: | |
* Grammar Accuracy | |
* Vocabulary Range | |
* Sentence Variety | |
* Academic Style""" | |
} | |
def extract_text_from_pdf(pdf_file) -> str: | |
""" | |
Extracts all text from a PDF file and returns it as a single string. | |
""" | |
try: | |
# Convert to BytesIO if needed | |
if isinstance(pdf_file, (str, bytes)): | |
pdf_file = BytesIO(pdf_file) | |
# Seek to beginning of file to ensure we can read it | |
pdf_file.seek(0) | |
reader = PyPDF2.PdfReader(pdf_file) | |
text = "" | |
logging.info(f"Processing PDF with {len(reader.pages)} pages") | |
# Extract text from all pages | |
for page_num in range(len(reader.pages)): | |
try: | |
page = reader.pages[page_num] | |
page_text = page.extract_text() | |
if page_text: | |
text += f"\n--- Page {page_num + 1} ---\n{page_text}\n" | |
logging.info(f"Successfully extracted text from page {page_num + 1}") | |
else: | |
logging.warning(f"No text extracted from page {page_num + 1}") | |
except Exception as e: | |
logging.error(f"Error processing page {page_num + 1}: {str(e)}") | |
continue | |
if not text.strip(): | |
logging.error("No text was extracted from any page of the PDF") | |
return "" | |
logging.info(f"Successfully extracted {len(text)} characters of text") | |
# Log a preview of the extracted text | |
preview = text[:500] + "..." if len(text) > 500 else text | |
logging.info(f"Text preview:\n{preview}") | |
return text | |
except Exception as e: | |
logging.error(f"Error extracting text from PDF: {str(e)}") | |
return "" | |
def clean_json_string(text: str) -> str: | |
""" | |
Clean and extract JSON from the response text. | |
Handles both array and object responses, ensuring the output is in {"questions": [...]} format. | |
""" | |
try: | |
# First try to parse the text directly | |
parsed = json.loads(text) | |
# If it's an array, wrap it in a questions object | |
if isinstance(parsed, list): | |
return json.dumps({"questions": parsed}) | |
# If it's an object with questions, return as is | |
if isinstance(parsed, dict) and "questions" in parsed: | |
return text | |
# If it's an object but missing questions array, wrap it | |
if isinstance(parsed, dict): | |
return json.dumps({"questions": [parsed]}) | |
raise ValueError("Invalid JSON structure") | |
except json.JSONDecodeError: | |
# If direct parsing fails, try to clean and extract JSON | |
try: | |
# Remove any markdown code block syntax | |
text = re.sub(r'```json\s*|\s*```', '', text) | |
# Find JSON-like structure | |
json_match = re.search(r'(\{[\s\S]*\}|\[[\s\S]*\])', text) | |
if json_match: | |
potential_json = json_match.group(0) | |
# Clean up common issues | |
potential_json = re.sub(r',(\s*[\}\]])', r'\1', potential_json) # Remove trailing commas | |
potential_json = re.sub(r'\\n', ' ', potential_json) # Replace newlines | |
potential_json = re.sub(r'\\([^"])', r'\1', potential_json) # Remove invalid escapes | |
# Parse and validate the cleaned JSON | |
parsed = json.loads(potential_json) | |
# Handle different formats | |
if isinstance(parsed, list): | |
return json.dumps({"questions": parsed}) | |
elif isinstance(parsed, dict) and "questions" in parsed: | |
return json.dumps(parsed) | |
elif isinstance(parsed, dict): | |
return json.dumps({"questions": [parsed]}) | |
else: | |
raise ValueError("Invalid JSON structure") | |
except (json.JSONDecodeError, AttributeError): | |
pass | |
# If all cleaning attempts fail, raise an error | |
raise ValueError("Could not extract valid JSON from response") | |
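# Illustrative behaviour of clean_json_string (not executed here): a bare JSON array
# and a single question object are both normalised into the wrapper shape that
# process_text() expects:
#
#   clean_json_string('[{"question_text": "..."}]')
#       -> '{"questions": [{"question_text": "..."}]}'
#   clean_json_string('{"question_text": "..."}')
#       -> '{"questions": [{"question_text": "..."}]}'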
def process_text(text: str, exam_type: str, structure: str, source_file: str) -> Tuple[List[Dict[str, Any]], float]:
""" | |
Process the entire text by extracting and formatting existing questions. | |
""" | |
# Create a container for this chunk's processing | |
with st.expander(f"π Processing Text Chunk", expanded=True): | |
col1, col2 = st.columns([3, 1]) | |
with col1: | |
status = st.empty() | |
status.info("π€ Sending request to AI...") | |
with col2: | |
progress = st.progress(0) | |
prompt = f"""You are a question extractor. Your task is to extract and format EVERY SINGLE question from the provided text into a complete JSON array. | |
ABSOLUTELY CRITICAL: | |
1. You MUST write out EVERY SINGLE question in full - no exceptions | |
2. DO NOT use any comments like "Additional questions would follow" | |
3. DO NOT add any notes or explanations outside the JSON | |
4. DO NOT use any placeholders or summaries | |
5. DO NOT mention "subset of questions" or similar | |
6. If there are 50 questions in the text, your JSON must contain exactly 50 complete question objects | |
7. If you hit a length limit, stop at the last complete question you can include | |
8. The response should be PURE JSON - nothing else | |
9. SKIP ANY QUESTIONS that refer to images, diagrams, graphs, figures, or visual elements | |
10. If a question mentions "look at the image", "in the picture", "as shown in", etc., DO NOT include it | |
11. For each question, include the specific source text that the question is based on | |
Text to process: | |
{text} | |
Domain Structure: | |
{structure} | |
Format EVERY question using this exact JSON structure: | |
{{ | |
"exam_type": "{exam_type}", | |
"content_type": "Generated", | |
"exam_section": "{exam_type.lower()}", | |
"domain": "domain_from_structure", | |
"subdomain": "subdomain_from_structure", | |
"topic": "topic_from_structure", | |
"difficulty_level": "one_of[Easy,Medium,Hard,Very Hard]", | |
"reading_passage": "exact_passage_from_text", | |
"reading_passage_title": "title_from_text_or_generate_appropriate_title", | |
"question_text": "exact_question_from_text", | |
"option_a": "exact_option_a_from_text", | |
"option_b": "exact_option_b_from_text", | |
"option_c": "exact_option_c_from_text", | |
"option_d": "exact_option_d_from_text", | |
"correct_answer": "one_of[A,B,C,D]_determined_from_context", | |
"explanation": "explanation_from_text_or_generate_based_on_answer", | |
"source_text": "exact_text_snippet_that_this_question_is_based_on", | |
"is_active": true | |
}}""" | |
try: | |
logging.info("Sending request to OpenAI...") | |
progress.progress(25) | |
status.info("π€ Generating questions...") | |
response = client.chat.completions.create( | |
model="o3-mini", | |
messages=[ | |
{"role": "user", "content": prompt} | |
], | |
response_format={"type": "json_object"} # Request JSON response format | |
) | |
content = response.choices[0].message.content.strip() | |
logging.info(f"Received response of length: {len(content)} characters") | |
progress.progress(50) | |
status.info("β¨ Processing AI response...") | |
# Log the first 200 characters of the response for debugging | |
logging.info(f"Response preview: {content[:200]}...") | |
# Clean and validate JSON | |
try: | |
logging.info("Attempting to clean and parse JSON...") | |
content = clean_json_string(content) | |
parsed_data = json.loads(content) | |
progress.progress(75) | |
if not isinstance(parsed_data, dict) or 'questions' not in parsed_data: | |
error_msg = "Response missing 'questions' array" | |
logging.error(error_msg) | |
status.error(error_msg) | |
raise ValueError(error_msg) | |
questions = parsed_data['questions'] | |
if not isinstance(questions, list): | |
error_msg = "'questions' is not an array" | |
logging.error(error_msg) | |
status.error(error_msg) | |
raise ValueError(error_msg) | |
logging.info(f"Found {len(questions)} questions in response") | |
status.success(f"π Found {len(questions)} questions") | |
# Validate questions | |
valid_questions = [] | |
invalid_count = 0 | |
# Create a validation progress bar | |
validation_progress = st.progress(0) | |
validation_status = st.empty() | |
validation_status.info("π Validating questions...") | |
# Default values for missing fields | |
default_values = { | |
"exam_type": exam_type, | |
"content_type": "Generated", | |
"exam_section": exam_type.lower(), | |
"domain": "General", | |
"subdomain": "General", | |
"topic": "General", | |
"difficulty_level": "Medium", | |
"reading_passage_title": None, | |
"is_active": True, | |
"source_file": source_file, | |
"source_text": text # Add default source text | |
} | |
for q_idx, q in enumerate(questions, 1): | |
# Add default values for missing fields | |
for field, default_value in default_values.items(): | |
if field not in q or q[field] is None: | |
q[field] = default_value | |
validation_errors = [] | |
# Required fields that must have non-empty values | |
critical_fields = [ | |
"question_text", | |
"option_a", | |
"option_b", | |
"option_c", | |
"option_d", | |
"correct_answer", | |
"explanation", | |
"source_text" # Add source_text as a critical field | |
] | |
# Validate critical fields | |
missing_fields = [f for f in critical_fields if not q.get(f)] | |
if missing_fields: | |
validation_errors.append(f"Missing critical fields: {missing_fields}") | |
# Validate field lengths | |
if len(q.get("question_text", "")) < 20: | |
validation_errors.append("Question text too short (min 20 chars)") | |
# Check if this is a math question | |
is_math = any(math_term in q.get('domain', '').lower() for math_term in ['math', 'algebra', 'geometry', 'calculus', 'arithmetic']) | |
# Validate correct answer format - only for non-math questions | |
if not is_math and q.get("correct_answer") not in ["A", "B", "C", "D"]: | |
validation_errors.append("Invalid correct_answer format (must be A, B, C, or D)") | |
# For math questions, just ensure there is a correct answer | |
if is_math and not q.get("correct_answer"): | |
validation_errors.append("Missing correct answer") | |
# Validate difficulty level | |
if q.get("difficulty_level") not in DIFFICULTY_LEVELS: | |
q["difficulty_level"] = "Medium" # Set default if invalid | |
if validation_errors: | |
invalid_count += 1 | |
error_msg = f"Question {q_idx} validation failed: {', '.join(validation_errors)}" | |
logging.warning(error_msg) | |
with st.expander(f"β οΈ Question {q_idx} Validation Issues", expanded=False): | |
st.warning(error_msg) | |
else: | |
valid_questions.append(q) | |
logging.info(f"Question {q_idx} passed validation") | |
# Update validation progress | |
validation_progress.progress(q_idx / len(questions)) | |
validation_status.info(f"π Validating questions... ({q_idx}/{len(questions)})") | |
progress.progress(100) | |
if not valid_questions: | |
error_msg = f"No valid questions were generated. {invalid_count} questions failed validation." | |
logging.error(error_msg) | |
status.error(error_msg) | |
return [], 0.0 | |
validation_status.success(f"β Successfully validated {len(valid_questions)} questions out of {len(questions)}") | |
# Calculate and log cost | |
input_tokens = len(prompt) / 4 # Rough estimate: 4 chars per token | |
output_tokens = len(content) / 4 | |
# o3-mini pricing: | |
# Input: $1.10 per 1M tokens | |
# Output: $4.40 per 1M tokens | |
text_cost = (input_tokens / 1_000_000 * 1.10) + (output_tokens / 1_000_000 * 4.40) | |
logging.info(f"Estimated cost for this chunk: ${text_cost:.6f}") | |
st.success(f"β¨ Generated {len(valid_questions)} valid questions (Cost: ${text_cost:.6f})") | |
return valid_questions, text_cost | |
except (json.JSONDecodeError, ValueError) as e: | |
error_msg = f"JSON parsing error: {str(e)}" | |
logging.error(f"{error_msg}\nResponse content: {content}") | |
status.error(error_msg) | |
# Log the problematic content for debugging | |
with st.expander("Show problematic response"): | |
st.code(content) | |
return [], 0.0 | |
except Exception as e: | |
error_msg = f"Error processing text: {str(e)}" | |
logging.error(error_msg) | |
status.error(error_msg) | |
return [], 0.0 | |
def process_chunk(chunk: str, exam_type: str, idx: int, structure: str) -> Tuple[List[Dict[str, Any]], float]:
""" | |
Process a single text chunk by first cleaning the text and then generating exam questions in a single LLM call. | |
This reduces the cost by combining cleaning and generation into one request. | |
Returns a tuple (valid_questions, chunk_cost). | |
""" | |
# Combined prompt that instructs the model to do two tasks: clean the text and then generate multiple exam questions. | |
combined_prompt = f""" | |
You are an expert text cleaner and exam question generator. First, clean and format the following text (fixing OCR issues and spacing) while preserving its exact meaning. | |
Then, based on the cleaned text, generate ALL possible exam questions. Extract every testable concept and create a comprehensive set of questions. Do not limit the number of questions - generate a question for every distinct piece of information or concept in the text. | |
Exam Question JSON Structure: | |
{{ | |
"exam_type": "{exam_type}", | |
"content_type": "Generated", | |
"exam_section": "{exam_type.lower()}", | |
"domain": "domain_from_structure", | |
"subdomain": "subdomain_from_structure", | |
"topic": "topic_from_structure", | |
"difficulty_level": "one_of[Easy,Medium,Hard,Very Hard]", | |
"reading_passage": "complete_passage_text", | |
"reading_passage_title": "title_or_null", | |
"question_text": "question_text", | |
"option_a": "first_option", | |
"option_b": "second_option", | |
"option_c": "third_option", | |
"option_d": "fourth_option", | |
"correct_answer": "one_of[A,B,C,D]", | |
"explanation": "detailed_explanation", | |
"is_active": true | |
}} | |
Domain Structure: | |
{structure} | |
Text to process: | |
{chunk} | |
Return ONLY a valid JSON object with an array of questions under the key "questions" and no additional explanation. | |
Please provide the response in valid JSON format. | |
""" | |
try: | |
response = client.chat.completions.create( | |
model="o3-mini", | |
messages=[ | |
{"role": "user", "content": combined_prompt}, | |
], | |
response_format={"type": "json_object"} # Request JSON response format | |
) | |
content = response.choices[0].message.content.strip() | |
# Estimate tokens (rough conversion: assume 1 token ~ 4 characters) | |
input_tokens = len(combined_prompt) / 4 | |
output_tokens = len(content) / 4 | |
# o3-mini pricing: | |
# Input: $1.10 per 1M tokens | |
# Output: $4.40 per 1M tokens | |
chunk_cost = (input_tokens / 1_000_000 * 1.10) + (output_tokens / 1_000_000 * 4.40) | |
try: | |
# Parse JSON response | |
parsed_data = json.loads(content) | |
questions = parsed_data.get("questions", []) | |
# Validate each question with the same checks. | |
required_fields = [ | |
"exam_type", "content_type", "exam_section", "domain", "subdomain", | |
"topic", "difficulty_level", "reading_passage", "question_text", | |
"option_a", "option_b", "option_c", "option_d", "correct_answer", | |
"explanation", "is_active" | |
] | |
valid_questions = [] | |
for q in questions: | |
missing_fields = [f for f in required_fields if f not in q or not q[f]] | |
if missing_fields: | |
logging.warning(f"Question missing required fields: {missing_fields}") | |
continue | |
if len(q["reading_passage"]) < 100: | |
logging.warning("Reading passage too short") | |
continue | |
if len(q["question_text"]) < 20: | |
logging.warning("Question text too short") | |
continue | |
if len(q["explanation"]) < 50: | |
logging.warning("Explanation too short") | |
continue | |
if q["correct_answer"] not in ["A", "B", "C", "D"]: | |
logging.warning("Invalid correct_answer format") | |
continue | |
valid_questions.append(q) | |
if len(valid_questions) < 3: | |
logging.warning(f"Generated only {len(valid_questions)} valid questions, expected at least 3") | |
return [], chunk_cost | |
return valid_questions, chunk_cost | |
except json.JSONDecodeError as je: | |
logging.error(f"JSON parsing error in chunk {idx + 1}: {str(je)}") | |
return [], chunk_cost | |
except Exception as e: | |
logging.error(f"Error processing response: {str(e)}") | |
return [], chunk_cost | |
except Exception as e: | |
logging.error(f"Error processing chunk {idx + 1}: {str(e)}") | |
safe_st_error(f"Error generating questions for chunk {idx + 1}: {str(e)}") | |
return [], 0.0 | |
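# Worked example of the rough cost estimate used in process_text/process_chunk above,
# assuming the code's own heuristic of ~4 characters per token and the o3-mini prices
# hard-coded in those functions ($1.10 / 1M input tokens, $4.40 / 1M output tokens):
#   a 20,000-character prompt  -> ~5,000 input tokens  -> ~$0.0055
#   a 12,000-character reply   -> ~3,000 output tokens -> ~$0.0132
#   estimated chunk cost       -> ~$0.0187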
def generate_questions(text_chunks: List[str], exam_type: str) -> Tuple[List[Dict[str, Any]], float]:
""" | |
Generates questions for each text chunk using concurrent processing. | |
Returns a tuple (questions, total_cost) where total_cost is the estimated GPT cost. | |
""" | |
all_questions = [] | |
total_cost = 0.0 | |
structure = domain_structures.get(exam_type, "") | |
# Create progress tracking elements in the main thread | |
progress_placeholder = st.empty() | |
status_placeholder = st.empty() | |
metrics_placeholder = st.empty() | |
# Process chunks concurrently | |
with ThreadPoolExecutor() as executor: | |
futures = [ | |
executor.submit(process_chunk, chunk, exam_type, idx, structure) | |
for idx, chunk in enumerate(text_chunks) | |
] | |
completed = 0 | |
total = len(text_chunks) | |
total_questions = 0 | |
# Process results as they complete | |
for future in as_completed(futures): | |
try: | |
chunk_questions, chunk_cost = future.result() | |
all_questions.extend(chunk_questions) | |
total_cost += chunk_cost | |
total_questions += len(chunk_questions) | |
# Update progress in the main thread | |
completed += 1 | |
progress = completed / total | |
# Update UI elements | |
progress_placeholder.progress(progress) | |
status_placeholder.text(f"Processing chunks: {completed}/{total}") | |
metrics_placeholder.metric( | |
label="Progress", | |
value=f"{completed}/{total} chunks", | |
delta=f"{total_questions} questions generated" | |
) | |
# Process any queued messages | |
while not log_queue.empty(): | |
msg_type, message = log_queue.get() | |
if msg_type == "warning": | |
st.warning(message) | |
elif msg_type == "error": | |
st.error(message) | |
except Exception as e: | |
st.error(f"Error processing chunk: {str(e)}") | |
# Show final summary | |
st.success(f"β Processing complete! Generated {total_questions} questions from {total} chunks. (Estimated cost: ${total_cost:.6f})") | |
# Clear progress tracking elements | |
progress_placeholder.empty() | |
status_placeholder.empty() | |
metrics_placeholder.empty() | |
return all_questions, total_cost | |
def upload_questions_to_supabase(generated_questions: List[Dict[str, Any]], source_file: str): | |
""" | |
Uploads generated questions to Supabase. | |
Args: | |
generated_questions: List of question dictionaries. | |
source_file: Name of the source PDF file. | |
""" | |
# Create a container for upload progress | |
with st.expander("π€ Uploading Questions", expanded=True): | |
st.markdown("### Upload Progress") | |
# Create metrics for upload stats | |
col1, col2, col3 = st.columns(3) | |
with col1: | |
total_metric = st.metric("Total Questions", len(generated_questions)) | |
with col2: | |
success_metric = st.metric("Uploaded", "0") | |
with col3: | |
failed_metric = st.metric("Failed", "0") | |
# Progress bar and status | |
progress = st.progress(0) | |
status = st.empty() | |
total = len(generated_questions) | |
successful_uploads = 0 | |
failed_uploads = 0 | |
for idx, question in enumerate(generated_questions): | |
status.info(f"π€ Uploading question {idx+1}/{total}") | |
# Generate a new valid UUID regardless of what was provided | |
new_uuid = str(uuid.uuid4()) | |
# Set default values if not present and match the table schema | |
question_fields = { | |
"id": new_uuid, | |
"exam_type": question.get("exam_type", "Unknown"), | |
"content_type": question.get("content_type", "Generated"), | |
"exam_section": question.get("exam_section") or question.get("exam_type", "Unknown").lower(), | |
"domain": question.get("domain", "General"), | |
"subdomain": question.get("subdomain", "General"), | |
"topic": question.get("topic", "General"), | |
"difficulty_level": question.get("difficulty_level"), | |
"reading_passage": question.get("reading_passage"), | |
"question_text": question.get("question_text", "Not Available"), | |
"option_a": question.get("option_a"), | |
"option_b": question.get("option_b"), | |
"option_c": question.get("option_c"), | |
"option_d": question.get("option_d"), | |
"correct_answer": question.get("correct_answer", "Not Available"), | |
"explanation": question.get("explanation"), | |
"source_file": source_file, | |
"is_active": question.get("is_active", True), | |
"is_fixed": False, | |
"metadata": json.dumps(question.get("metadata")) if question.get("metadata") else None, | |
"source_text": question.get("source_text") | |
} | |
try: | |
# Insert the question and get the response | |
response = supabase.table("exam_contents").insert(question_fields).execute() | |
# Check if the response data indicates success | |
if response.data: | |
successful_uploads += 1 | |
success_metric.metric("Uploaded", str(successful_uploads), delta=1) | |
else: | |
failed_uploads += 1 | |
failed_metric.metric("Failed", str(failed_uploads), delta=1) | |
with st.expander(f"β οΈ Upload Issue - Question {idx+1}", expanded=False): | |
st.warning(f"Failed to insert question: {response.error}") | |
except Exception as e: | |
failed_uploads += 1 | |
failed_metric.metric("Failed", str(failed_uploads), delta=1) | |
with st.expander(f"β Upload Error - Question {idx+1}", expanded=False): | |
st.error(f"Error uploading question: {str(e)}") | |
# Update progress | |
progress.progress((idx + 1) / total) | |
# Show final upload summary | |
if failed_uploads == 0: | |
status.success(f"β Upload complete! Successfully uploaded all {successful_uploads} questions.") | |
else: | |
status.warning(f"β οΈ Upload complete with some issues. Successful: {successful_uploads}, Failed: {failed_uploads}") | |
def split_text_into_chunks(text: str, max_chunk_size: int = 20000) -> List[str]: | |
# Ensure that text is a string before processing. | |
if not isinstance(text, str): | |
try: | |
text = text.decode("utf-8") | |
except Exception: | |
text = str(text) | |
# Remove any leading/trailing whitespace. | |
text = text.strip() | |
total_length = len(text) | |
# Split the text into fixed-size chunks using slicing. | |
chunks = [text[i:i+max_chunk_size] for i in range(0, total_length, max_chunk_size)] | |
logging.info(f"Split text into {len(chunks)} chunks of up to {max_chunk_size} characters each.") | |
return chunks | |
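# Example: a 45,000-character document with the default max_chunk_size of 20,000 is
# split into three slices of 20,000, 20,000 and 5,000 characters. Chunks are cut
# purely by position, so a question near a boundary may be split across two chunks.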
def check_duplicate_pdf(pdf_file) -> bool: | |
""" | |
Check if a PDF file has already been processed by comparing its name with existing source files. | |
Returns True if the file is a duplicate, False otherwise. | |
""" | |
try: | |
existing_files = get_unique_source_files() | |
return pdf_file.name in existing_files | |
except Exception as e: | |
logging.error(f"Error checking for duplicate PDF: {str(e)}") | |
return False | |
def process_pdfs(pdf_files, exam_type): | |
""" | |
Process multiple PDF files and generate questions. | |
""" | |
# Create a container for logs | |
log_container = st.container() | |
with log_container: | |
st.subheader("Processing Logs") | |
log_output = st.empty() | |
all_questions = [] | |
overall_cost = 0.0 | |
progress_text = st.empty() | |
progress_bar = st.progress(0) | |
structure = domain_structures.get(exam_type, "") | |
# Check for duplicates before processing | |
duplicate_files = [] | |
for pdf_file in pdf_files: | |
if check_duplicate_pdf(pdf_file): | |
duplicate_files.append(pdf_file.name) | |
if duplicate_files: | |
st.warning(f"The following files have already been processed:\n" + | |
"\n".join(f"- {file}" for file in duplicate_files)) | |
# Filter out duplicate files | |
pdf_files = [f for f in pdf_files if f.name not in duplicate_files] | |
if not pdf_files: | |
st.error("No new files to process. Please upload different PDF files.") | |
return None, None | |
for i, pdf_file in enumerate(pdf_files): | |
file_msg = f"Processing file {i+1}/{len(pdf_files)}: {pdf_file.name}" | |
progress_text.text(file_msg) | |
logging.info(file_msg) | |
try: | |
# Read the file content directly from the UploadedFile object | |
pdf_content = pdf_file.getvalue() | |
pdf_file_obj = BytesIO(pdf_content) | |
# Extract text | |
full_text = extract_text_from_pdf(pdf_file_obj) | |
if not full_text: | |
warning_msg = f"No text extracted from {pdf_file.name}" | |
logging.warning(warning_msg) | |
st.warning(warning_msg) | |
continue | |
# Log the size of extracted text | |
logging.info(f"Extracted {len(full_text)} characters from {pdf_file.name}") | |
try: | |
# Split text into smaller chunks based on question sets | |
chunks = split_text_into_chunks(full_text) | |
chunk_msg = f"Split {pdf_file.name} into {len(chunks)} chunks" | |
logging.info(chunk_msg) | |
st.info(chunk_msg) | |
# Log more details about chunks | |
for idx, chunk in enumerate(chunks): | |
logging.info(f"Chunk {idx+1} contains {chunk.count('Question')} potential questions") | |
logging.info(f"Chunk {idx+1} size: {len(chunk)} characters") | |
chunk_progress = st.progress(0) | |
chunk_status = st.empty() | |
# Process each chunk | |
for chunk_idx, chunk in enumerate(chunks): | |
chunk_msg = f"Processing chunk {chunk_idx + 1}/{len(chunks)} of {pdf_file.name}" | |
chunk_status.text(chunk_msg) | |
logging.info(chunk_msg) | |
# Process the chunk | |
chunk_questions, chunk_cost = process_text(chunk, exam_type, structure, pdf_file.name) | |
overall_cost += chunk_cost | |
if chunk_questions: | |
all_questions.extend(chunk_questions) | |
success_msg = f"Generated {len(chunk_questions)} questions from chunk {chunk_idx + 1}" | |
logging.info(success_msg) | |
st.success(success_msg) | |
# Upload chunk questions | |
upload_msg = f"Uploading {len(chunk_questions)} questions to database..." | |
logging.info(upload_msg) | |
st.text(upload_msg) | |
upload_questions_to_supabase(chunk_questions, pdf_file.name) | |
else: | |
warning_msg = f"No valid questions generated from chunk {chunk_idx + 1}" | |
logging.warning(warning_msg) | |
st.warning(warning_msg) | |
chunk_progress.progress((chunk_idx + 1) / len(chunks)) | |
# Update log display | |
log_output.text_area("Processing Logs", value=log_handler.get_logs(), height=200) | |
except Exception as e: | |
error_msg = f"Error processing {pdf_file.name}: {str(e)}" | |
logging.error(error_msg) | |
st.error(error_msg) | |
except Exception as e: | |
st.error(f"Error processing file {pdf_file.name}: {str(e)}") | |
# Update overall progress | |
progress_bar.progress((i + 1) / len(pdf_files)) | |
# Final summary | |
if all_questions: | |
success_msg = f"Successfully generated {len(all_questions)} questions total. Total cost: ${overall_cost:.6f}" | |
logging.info(success_msg) | |
st.success(success_msg) | |
# Create the JSON output | |
questions_json = json.dumps(all_questions, indent=4) | |
return questions_json, questions_json.encode('utf-8') | |
else: | |
warning_msg = "No questions were generated from any of the files." | |
logging.warning(warning_msg) | |
st.warning(warning_msg) | |
return None, None | |
def get_questions(filters=None): | |
"""Fetch questions from Supabase with optional filters.""" | |
try: | |
# Initialize an empty list to store all questions | |
all_questions = [] | |
page_size = 1000 # Supabase default page size | |
current_start = 0 | |
while True: | |
# Build the query with pagination | |
query = supabase.table("exam_contents").select("*").range(current_start, current_start + page_size - 1) | |
# Apply filters if any | |
if filters: | |
for key, value in filters.items(): | |
if value and value != "All": | |
query = query.eq(key, value) | |
# Execute query | |
response = query.execute() | |
# If no data returned, break the loop | |
if not response.data: | |
break | |
# Add the current page's data to our results | |
all_questions.extend(response.data) | |
# If we got less than a full page, we're done | |
if len(response.data) < page_size: | |
break | |
# Move to next page | |
current_start += page_size | |
logging.info(f"Retrieved total of {len(all_questions)} questions from database") | |
return all_questions | |
except Exception as e: | |
logging.error(f"Error fetching questions: {e}") | |
st.error(f"Database error: {str(e)}") | |
return [] | |
def get_analytics_data(questions): | |
"""Generate analytics data from questions.""" | |
df = pd.DataFrame(questions) | |
analytics = { | |
'total_questions': len(df), | |
'active_questions': len([q for q in questions if q.get('is_active', True)]), | |
'inactive_questions': len([q for q in questions if not q.get('is_active', True)]), | |
'unfixed_questions': len([q for q in questions if not q.get('is_fixed', False)]) | |
} | |
# Basic statistics | |
if 'exam_type' in df.columns: | |
analytics['questions_by_exam'] = df['exam_type'].value_counts() | |
else: | |
analytics['questions_by_exam'] = pd.Series(dtype='int64') | |
if 'difficulty_level' in df.columns: | |
analytics['questions_by_difficulty'] = df['difficulty_level'].value_counts() | |
else: | |
analytics['questions_by_difficulty'] = pd.Series(dtype='int64') | |
if 'domain' in df.columns: | |
analytics['questions_by_domain'] = df['domain'].value_counts() | |
else: | |
analytics['questions_by_domain'] = pd.Series(dtype='int64') | |
# Include exam_type in the domain/subdomain grouping | |
if all(col in df.columns for col in ['exam_type', 'domain', 'subdomain']): | |
analytics['questions_by_subdomain'] = df.groupby(['exam_type', 'domain', 'subdomain']).size().reset_index(name='count') | |
else: | |
analytics['questions_by_subdomain'] = pd.DataFrame(columns=['exam_type', 'domain', 'subdomain', 'count']) | |
# Time-based analytics | |
if 'created_at' in df.columns: | |
df['created_at'] = pd.to_datetime(df['created_at']) | |
analytics['questions_by_date'] = df.resample('D', on='created_at').size() | |
analytics['questions_by_month'] = df.resample('M', on='created_at').size() | |
analytics['recent_activity'] = df.sort_values('created_at', ascending=False).head(10) | |
# Content coverage analysis | |
if 'reading_passage' in df.columns: | |
analytics['has_passage'] = df['reading_passage'].notna().sum() | |
analytics['passage_ratio'] = (df['reading_passage'].notna().sum() / len(df)) * 100 if len(df) > 0 else 0 | |
# Calculate average passage length | |
df['passage_length'] = df['reading_passage'].str.len().fillna(0) | |
analytics['avg_passage_length'] = df['passage_length'].mean() | |
analytics['passage_length_dist'] = df['passage_length'].describe() | |
# Question quality metrics | |
if 'explanation' in df.columns: | |
analytics['has_explanation'] = df['explanation'].notna().sum() | |
analytics['explanation_ratio'] = (df['explanation'].notna().sum() / len(df)) * 100 if len(df) > 0 else 0 | |
# Calculate explanation comprehensiveness | |
df['explanation_length'] = df['explanation'].str.len().fillna(0) | |
analytics['avg_explanation_length'] = df['explanation_length'].mean() | |
analytics['explanation_length_dist'] = df['explanation_length'].describe() | |
# Options analysis | |
option_cols = ['option_a', 'option_b', 'option_c', 'option_d'] | |
if all(col in df.columns for col in option_cols): | |
df['options_count'] = df[option_cols].notna().sum(axis=1) | |
analytics['complete_options'] = (df['options_count'] == 4).sum() | |
analytics['options_ratio'] = (analytics['complete_options'] / len(df)) * 100 if len(df) > 0 else 0 | |
# Domain coverage analysis | |
if 'domain' in df.columns: | |
domain_coverage = df.groupby(['domain'])['subdomain'].nunique().reset_index() | |
domain_coverage.columns = ['domain', 'unique_subdomains'] | |
analytics['domain_coverage'] = domain_coverage | |
# Calculate domain balance score (0-100) per exam type | |
domain_balance_scores = [] | |
for exam_type in df['exam_type'].unique(): | |
exam_domain_counts = df[df['exam_type'] == exam_type]['domain'].value_counts() | |
if not exam_domain_counts.empty: | |
max_count = exam_domain_counts.max() | |
min_count = exam_domain_counts.min() | |
score = ((1 - (max_count - min_count) / max_count) * 100) if max_count > 0 else 100 | |
domain_balance_scores.append({'exam_type': exam_type, 'balance_score': score}) | |
analytics['domain_balance_by_exam'] = pd.DataFrame(domain_balance_scores) | |
analytics['domain_balance_score'] = analytics['domain_balance_by_exam']['balance_score'].mean() | |
return analytics | |
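# Worked example of the domain balance score computed above: if one exam type has
# domain counts {"Mathematics": 60, "Reading and Writing": 40}, then
#   score = (1 - (60 - 40) / 60) * 100 ≈ 66.7
# A perfectly even split scores 100; the more lopsided the counts, the lower the score.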
def rewrite_question(question: Dict[str, Any], prompt: str = "") -> Dict[str, Any]: | |
""" | |
Use LLM to rewrite the question, passage, and options while maintaining the same concept. | |
""" | |
base_prompt = """Rewrite the following exam question with a new passage and options. Keep the same concept, difficulty level, and correct answer position, but create fresh content.""" | |
# Add custom prompt if provided | |
if prompt: | |
base_prompt += f"\n\nSpecial Instructions: {prompt}" | |
prompt = f"""{base_prompt} | |
Current Question: | |
Reading Passage: {question.get('reading_passage', '')} | |
Question: {question.get('question_text', '')} | |
Options: | |
A) {question.get('option_a', '')} | |
B) {question.get('option_b', '')} | |
C) {question.get('option_c', '')} | |
D) {question.get('option_d', '')} | |
Correct Answer: {question.get('correct_answer', '')} | |
Explanation: {question.get('explanation', '')} | |
IMPORTANT LENGTH REQUIREMENTS: | |
- Reading passage must be AT LEAST 100 characters (preferably 200-300) | |
- Question text must be AT LEAST 50 characters | |
- Options can be concise but clear (no minimum length) | |
- Explanation must be AT LEAST 50 characters | |
Requirements: | |
1. Create a new reading passage that: | |
- Must be AT LEAST 100 characters (preferably 200-300) | |
- Covers the same concepts in detail | |
- Maintains similar complexity | |
- Uses rich context and examples | |
{"- Incorporates the special instructions provided above" if prompt else ""} | |
2. Write a detailed question that: | |
- Must be AT LEAST 50 characters | |
- Clearly states what is being asked | |
- Includes necessary context | |
3. Create clear options that: | |
- Are concise but clear | |
- Are distinct from each other | |
- Follow a similar format | |
- Maintain the correct answer in the same position | |
4. Write a good explanation that: | |
- Must be AT LEAST 50 characters | |
- Explains the correct answer | |
- Provides clear reasoning | |
- References the passage when relevant | |
Return ONLY a JSON object with the following structure: | |
{{ | |
"reading_passage": "new_passage (MINIMUM 100 characters)", | |
"question_text": "new_question (MINIMUM 50 characters)", | |
"option_a": "new_option_a (concise)", | |
"option_b": "new_option_b (concise)", | |
"option_c": "new_option_c (concise)", | |
"option_d": "new_option_d (concise)", | |
"explanation": "new_explanation (MINIMUM 50 characters)" | |
}}""" | |
    try:
        system_message = (
            "You are an expert at rewriting exam questions. Create a detailed reading passage "
            "(100+ chars) and clear question (50+ chars). Options should be concise but clear. "
            "Explanation should be thorough (50+ chars)."
        )
        response = client.chat.completions.create(
            model="o3-mini",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            response_format={"type": "json_object"}  # Request JSON response format
        )
        # Parse the response (keep the raw string so the cost estimate below can use it)
        content = response.choices[0].message.content
        new_content = json.loads(content)
# Validate minimum length requirements with detailed error messages | |
length_requirements = { | |
'reading_passage': 100, | |
'question_text': 50, | |
'explanation': 50 | |
} | |
errors = [] | |
for key, min_length in length_requirements.items(): | |
value = new_content.get(key, '') | |
current_length = len(value) | |
if current_length < min_length: | |
errors.append(f"{key} is too short: {current_length} chars (minimum {min_length} required)") | |
if errors: | |
error_message = "\n".join(errors) | |
raise ValueError(f"Content length requirements not met:\n{error_message}") | |
# Update the question with new content while preserving other fields | |
updated_question = question.copy() | |
updated_question.update(new_content) | |
# Calculate and log cost | |
input_tokens = (len(system_message) + len(prompt)) / 4 # Rough estimate: 4 chars per token | |
output_tokens = len(content) / 4 | |
# o3-mini pricing: | |
# Input: $1.10 per 1M tokens | |
# Output: $4.40 per 1M tokens | |
rewrite_cost = (input_tokens / 1_000_000 * 1.10) + (output_tokens / 1_000_000 * 4.40) | |
logging.info(f"Estimated cost for rewriting this question: ${rewrite_cost:.6f}") | |
return updated_question | |
except json.JSONDecodeError as je: | |
error_msg = f"Invalid JSON response from LLM: {str(je)}" | |
logging.error(error_msg) | |
raise ValueError(error_msg) | |
except Exception as e: | |
logging.error(f"Error rewriting question: {str(e)}") | |
raise e | |
def display_question(question, index): | |
"""Display a single question with its details.""" | |
with st.expander(f"Question {index + 1}", expanded=index == 0): | |
# Add delete and rewrite buttons in the top right corner | |
col1, col2, col3 = st.columns([5, 1, 1]) | |
# Add prompt input field | |
prompt = st.text_area( | |
"Rewrite Instructions", | |
value="", | |
placeholder="Enter specific instructions for rewriting this question (e.g., 'include text about renewable energy' or 'make it about space exploration')", | |
key=f"prompt_{question['id']}" | |
) | |
with col2: | |
if st.button("π Rewrite", key=f"rewrite_{question['id']}", type="primary"): | |
try: | |
with st.spinner("Rewriting question..."): | |
# Rewrite the question with the prompt | |
updated_question = rewrite_question(question, prompt) | |
# Update in Supabase | |
supabase.table("exam_contents").update(updated_question).eq("id", question['id']).execute() | |
st.success("Question rewritten successfully!") | |
# Refresh the page | |
st.rerun() | |
except Exception as e: | |
st.error(f"Error rewriting question: {str(e)}") | |
with col3: | |
if st.button("ποΈ Delete", key=f"delete_{question['id']}", type="secondary"): | |
try: | |
# Delete from Supabase | |
supabase.table("exam_contents").delete().eq("id", question['id']).execute() | |
st.success("Question deleted successfully!") | |
# Add a rerun to refresh the page | |
st.rerun() | |
except Exception as e: | |
st.error(f"Error deleting question: {str(e)}") | |
# Metadata | |
with col1: | |
col_a, col_b, col_c, col_d, col_e = st.columns(5) | |
with col_a: | |
st.markdown(f"**Domain:** {question.get('domain', 'N/A')}") | |
with col_b: | |
st.markdown(f"**Subdomain:** {question.get('subdomain', 'N/A')}") | |
with col_c: | |
st.markdown(f"**Topic:** {question.get('topic', 'N/A')}") | |
with col_d: | |
st.markdown(f"**Difficulty:** {question.get('difficulty_level', 'N/A')}") | |
with col_e: | |
st.markdown(f"**Source:** {question.get('source_file', 'N/A')}") | |
# Source text if available | |
if question.get('source_text'): | |
st.markdown("### π Source Text") | |
st.markdown( | |
f"""<div style='background-color: #e8f4f9; padding: 20px; border-radius: 10px; margin: 10px 0; color: #1f1f1f;'> | |
{question['source_text']} | |
</div>""", | |
unsafe_allow_html=True | |
) | |
# Reading passage if available | |
if question.get('reading_passage'): | |
st.markdown("### π Reading Passage") | |
st.markdown( | |
f"""<div style='background-color: #f0f2f6; padding: 20px; border-radius: 10px; margin: 10px 0; color: #1f1f1f;'> | |
{question['reading_passage']} | |
</div>""", | |
unsafe_allow_html=True | |
) | |
# Question text and options | |
st.markdown("### β Question") | |
st.markdown(f"{question.get('question_text', '')}") | |
if any(question.get(f'option_{opt}') for opt in ['a', 'b', 'c', 'd']): | |
st.markdown("### Options") | |
options_container = st.container() | |
with options_container: | |
for opt in ['a', 'b', 'c', 'd']: | |
if question.get(f'option_{opt}'): | |
st.markdown(f"**{opt.upper()}.** {question[f'option_{opt}']}") | |
# Answer and explanation | |
st.markdown("### Answer & Explanation") | |
col1, col2 = st.columns(2) | |
with col1: | |
st.markdown( | |
f"""<div style='background-color: #e8f4ea; padding: 10px; border-radius: 5px; margin: 10px 0; color: #1f1f1f;'> | |
<strong>Correct Answer:</strong> {question.get('correct_answer', 'N/A')} | |
</div>""", | |
unsafe_allow_html=True | |
) | |
with col2: | |
if question.get('explanation'): | |
st.markdown( | |
f"""<div style='background-color: #fff3e0; padding: 10px; border-radius: 5px; color: #1f1f1f;'> | |
<strong>Explanation:</strong><br>{question['explanation']} | |
</div>""", | |
unsafe_allow_html=True | |
) | |
def display_analytics(analytics): | |
"""Display analytics visualizations.""" | |
st.markdown(""" | |
<h2 style='text-align: center; margin-bottom: 40px;'>π Analytics Dashboard</h2> | |
""", unsafe_allow_html=True) | |
# Key Metrics Overview | |
st.markdown(""" | |
<div style='text-align: center; margin-bottom: 30px;'> | |
<h3 style='color: #0f4c81;'>Key Metrics</h3> | |
</div> | |
""", unsafe_allow_html=True) | |
metrics_container = st.container() | |
with metrics_container: | |
col1, col2, col3, col4, col5 = st.columns(5) | |
with col1: | |
st.metric("π Total Questions", analytics['total_questions']) | |
with col2: | |
st.metric("β Active Questions", analytics['active_questions']) | |
with col3: | |
st.metric("β Inactive Questions", analytics['inactive_questions']) | |
with col4: | |
num_domains = len(analytics['questions_by_domain']) if not analytics['questions_by_domain'].empty else 0 | |
st.metric("π― Number of Domains", num_domains) | |
with col5: | |
if 'domain_balance_score' in analytics: | |
balance_score = f"{analytics['domain_balance_score']:.1f}%" | |
st.metric("βοΈ Domain Balance Score", balance_score) | |
# Content Quality Metrics | |
if any(key in analytics for key in ['has_explanation', 'complete_options', 'avg_passage_length']): | |
st.markdown(""" | |
<div style='text-align: center; margin: 30px 0;'> | |
<h3 style='color: #0f4c81;'>Content Quality Metrics</h3> | |
</div> | |
""", unsafe_allow_html=True) | |
quality_cols = st.columns(3) | |
with quality_cols[0]: | |
if 'explanation_ratio' in analytics: | |
st.metric("π Questions with Explanations", | |
f"{analytics['explanation_ratio']:.1f}%", | |
help="Percentage of questions that have explanations") | |
with quality_cols[1]: | |
if 'options_ratio' in analytics: | |
st.metric("β Complete Option Sets", | |
f"{analytics['options_ratio']:.1f}%", | |
help="Percentage of questions with all 4 options") | |
with quality_cols[2]: | |
if 'avg_passage_length' in analytics: | |
st.metric("π Avg Passage Length", | |
f"{int(analytics['avg_passage_length'])} chars", | |
help="Average length of reading passages") | |
# Time-based Analytics | |
if 'questions_by_date' in analytics and not analytics['questions_by_date'].empty: | |
st.markdown(""" | |
<div style='text-align: center; margin: 30px 0;'> | |
<h3 style='color: #0f4c81;'>Question Generation Timeline</h3> | |
</div> | |
""", unsafe_allow_html=True) | |
# Daily question generation trend | |
fig_timeline = px.line( | |
x=analytics['questions_by_date'].index, | |
y=analytics['questions_by_date'].values, | |
title="Daily Question Generation", | |
labels={'x': 'Date', 'y': 'Number of Questions'} | |
) | |
fig_timeline.update_layout(showlegend=False) | |
st.plotly_chart(fig_timeline, use_container_width=True) | |
# Monthly aggregation | |
if 'questions_by_month' in analytics and not analytics['questions_by_month'].empty: | |
fig_monthly = px.bar( | |
x=analytics['questions_by_month'].index, | |
y=analytics['questions_by_month'].values, | |
title="Monthly Question Generation", | |
labels={'x': 'Month', 'y': 'Number of Questions'} | |
) | |
fig_monthly.update_layout(showlegend=False) | |
st.plotly_chart(fig_monthly, use_container_width=True) | |
# Questions by Exam Type | |
if not analytics['questions_by_exam'].empty: | |
st.markdown(""" | |
<div style='text-align: center; margin: 30px 0;'> | |
<h3 style='color: #0f4c81;'>Distribution by Exam Type</h3> | |
</div> | |
""", unsafe_allow_html=True) | |
col1, col2, col3 = st.columns([1,3,1]) | |
with col2: | |
fig = px.pie( | |
values=analytics['questions_by_exam'].values, | |
names=analytics['questions_by_exam'].index, | |
hole=0.4, | |
color_discrete_sequence=px.colors.qualitative.Set3 | |
) | |
fig.update_layout( | |
showlegend=True, | |
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5), | |
margin=dict(t=60, b=40, l=40, r=40) | |
) | |
st.plotly_chart(fig, use_container_width=True) | |
# Questions by Difficulty | |
if not analytics['questions_by_difficulty'].empty: | |
st.markdown(""" | |
<div style='text-align: center; margin: 30px 0;'> | |
<h3 style='color: #0f4c81;'>Distribution by Difficulty Level</h3> | |
</div> | |
""", unsafe_allow_html=True) | |
col1, col2, col3 = st.columns([1,3,1]) | |
with col2: | |
fig = px.bar( | |
x=analytics['questions_by_difficulty'].index, | |
y=analytics['questions_by_difficulty'].values, | |
color=analytics['questions_by_difficulty'].index, | |
color_discrete_sequence=px.colors.qualitative.Set2 | |
) | |
fig.update_layout( | |
showlegend=False, | |
xaxis_title="Difficulty Level", | |
yaxis_title="Number of Questions", | |
margin=dict(t=40, b=40, l=40, r=40) | |
) | |
st.plotly_chart(fig, use_container_width=True) | |
# Domain Coverage Analysis | |
if 'domain_coverage' in analytics and not analytics['domain_coverage'].empty: | |
st.markdown(""" | |
<div style='text-align: center; margin: 30px 0;'> | |
<h3 style='color: #0f4c81;'>Domain Coverage Analysis</h3> | |
</div> | |
""", unsafe_allow_html=True) | |
# Domain coverage heatmap | |
fig_coverage = px.bar( | |
analytics['domain_coverage'], | |
x='domain', | |
y='unique_subdomains', | |
title="Number of Unique Subdomains per Domain", | |
color='unique_subdomains', | |
color_continuous_scale='Viridis' | |
) | |
fig_coverage.update_layout( | |
xaxis_title="Domain", | |
yaxis_title="Number of Unique Subdomains", | |
showlegend=False | |
) | |
st.plotly_chart(fig_coverage, use_container_width=True) | |
# Questions by Domain and Subdomain | |
if not analytics['questions_by_subdomain'].empty and len(analytics['questions_by_subdomain']) > 0: | |
st.markdown(""" | |
<div style='text-align: center; margin: 30px 0;'> | |
<h3 style='color: #0f4c81;'>Distribution by Domain and Subdomain</h3> | |
</div> | |
""", unsafe_allow_html=True) | |
fig = px.treemap( | |
analytics['questions_by_subdomain'], | |
path=['exam_type', 'domain', 'subdomain'], | |
values='count', | |
color='count', | |
color_continuous_scale='Viridis' | |
) | |
fig.update_layout(margin=dict(t=30, b=30, l=30, r=30)) | |
fig.update_traces(textinfo="label+value") | |
st.plotly_chart(fig, use_container_width=True) | |
# Recent Activity | |
if 'recent_activity' in analytics and not analytics['recent_activity'].empty: | |
st.markdown(""" | |
<div style='text-align: center; margin: 30px 0;'> | |
<h3 style='color: #0f4c81;'>Recent Activity</h3> | |
</div> | |
""", unsafe_allow_html=True) | |
recent_df = analytics['recent_activity'] | |
st.dataframe( | |
recent_df[['exam_type', 'domain', 'subdomain', 'difficulty_level', 'created_at']], | |
hide_index=True, | |
column_config={ | |
'created_at': 'Timestamp', | |
'exam_type': 'Exam Type', | |
'domain': 'Domain', | |
'subdomain': 'Subdomain', | |
'difficulty_level': 'Difficulty' | |
} | |
) | |
# Add some spacing at the bottom | |
st.markdown("<br><br>", unsafe_allow_html=True) | |
def get_unique_domains(): | |
"""Get unique domains from the database.""" | |
domains = { | |
"SAT": ["Mathematics", "Reading and Writing"], | |
"IELTS": ["Reading", "Writing", "Speaking", "Listening"], | |
"TOEFL": ["Reading", "Listening", "Speaking", "Writing"] | |
} | |
return domains | |
def get_subdomains_for_domain(exam_type: str, domain: str) -> List[str]: | |
"""Get subdomains for a specific domain by parsing the domain structure.""" | |
parsed_structure = parse_domain_structure(exam_type) | |
return list(parsed_structure.get(domain, {}).keys()) | |
def parse_domain_structure(exam_type: str) -> dict: | |
"""Parse the domain structure string into a dictionary format.""" | |
structure = domain_structures.get(exam_type, "") | |
if not structure: | |
return {} | |
result = {} | |
current_domain = None | |
current_subdomain = None | |
for line in structure.split('\n'): | |
line = line.strip() | |
if not line: | |
continue | |
# Match domain (e.g., "1. Reading and Writing:") | |
if line[0].isdigit() and line.endswith(':'): | |
current_domain = line.split('.', 1)[1].split(':', 1)[0].strip() | |
result[current_domain] = {} | |
# Match subdomain (e.g., "- Information and Ideas:")
elif line.startswith('-'):
    # Ignore subdomain lines that appear before any domain heading
    if current_domain:
        current_subdomain = line[1:].split(':', 1)[0].strip()
        result[current_domain][current_subdomain] = []
# Match topic (e.g., "* Central Ideas and Details") | |
elif line.startswith('*'): | |
if current_domain and current_subdomain: | |
topic = line[1:].strip() | |
result[current_domain][current_subdomain].append(topic) | |
return result | |
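# Illustrative result: parse_domain_structure("SAT") returns a nested dict such as
#   {"Reading and Writing": {"Information and Ideas": ["Central Ideas and Details", ...]}}
# which get_subdomains_for_domain() and get_topics_for_subdomain() index into.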
def get_topics_for_subdomain(exam_type: str, domain: str, subdomain: str) -> List[str]: | |
"""Get topics for a specific subdomain by parsing the domain structure.""" | |
parsed_structure = parse_domain_structure(exam_type) | |
return parsed_structure.get(domain, {}).get(subdomain, []) | |
def get_unique_source_files(): | |
"""Get unique source files from the database, with pagination to retrieve all records.""" | |
try: | |
source_files = set() | |
page_size = 1000 | |
current_start = 0 | |
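# Page through the table: a single Supabase select is capped (1,000 rows by
# default), so repeated .range() requests are needed to see every record.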
while True: | |
response = supabase.table("exam_contents").select("source_file").range(current_start, current_start + page_size - 1).execute() | |
if not response.data: | |
break | |
for item in response.data: | |
if item.get('source_file'): | |
source_files.add(item['source_file']) | |
if len(response.data) < page_size: | |
break | |
current_start += page_size | |
return sorted(list(source_files)) | |
except Exception as e: | |
st.error(f"Error fetching source files: {str(e)}") | |
return [] | |
# Streamlit Interface | |
st.title("π PDF to Exam Questions Generator with Supabase Upload") | |
# Create tabs for different functionalities | |
tab_upload, tab_view, tab_analytics = st.tabs(["📤 Upload & Generate", "📋 View Questions", "📊 Analytics"])
with tab_upload: | |
st.markdown( | |
""" | |
Upload PDF files containing exam material, select the exam type, and generate structured questions automatically. | |
The generated questions will be uploaded to your Supabase database. | |
**Supported Exam Types**: SAT, IELTS, TOEFL | |
""" | |
) | |
# File uploader and exam type selection | |
uploaded_files = st.file_uploader("📥 Upload PDFs", type=["pdf"], accept_multiple_files=True)
exam_type = st.selectbox( | |
"π Select Exam Type", | |
options=["SAT", "IELTS", "TOEFL"], | |
index=0 | |
) | |
# Generate and Upload Button | |
if st.button("π Generate and Upload Questions"): | |
if not uploaded_files: | |
st.error("Please upload at least one PDF file.") | |
else: | |
with st.spinner("Processing files..."): | |
questions_json, download_content = process_pdfs(uploaded_files, exam_type) | |
if questions_json: | |
st.success(f"Successfully processed {len(uploaded_files)} files and generated questions!") | |
st.json(json.loads(questions_json)) | |
# Provide download button | |
st.download_button( | |
label="β¬οΈ Download Questions JSON", | |
data=download_content, | |
file_name=f"generated_questions_{uuid.uuid4()}.json", | |
mime="application/json" | |
) | |
with tab_view: | |
st.subheader("Question Browser") | |
# Initialize session state | |
if 'selected_domain' not in st.session_state: | |
st.session_state.selected_domain = "All" | |
if 'selected_subdomain' not in st.session_state: | |
st.session_state.selected_subdomain = "All" | |
if 'selected_topic' not in st.session_state: | |
st.session_state.selected_topic = "All" | |
# Filters | |
col1, col2, col3 = st.columns(3) | |
with col1: | |
view_exam_type = st.selectbox("Exam Type", ["All"] + EXAM_TYPES, key="view_exam_type") | |
# Get domains based on exam type | |
domains = ["All"] | |
if view_exam_type != "All": | |
domains.extend(get_unique_domains().get(view_exam_type, [])) | |
domain = st.selectbox("Domain", domains, key="domain_select") | |
# Reset the dependent subdomain and topic widgets when the domain changes
# (the selectboxes are keyed on "subdomain_select"/"topic_select", so those
# session-state entries are the ones that must be cleared).
if domain != st.session_state.get('last_domain'):
    st.session_state.last_domain = domain
    st.session_state.subdomain_select = "All"
    st.session_state.topic_select = "All"
with col2: | |
difficulty = st.selectbox("Difficulty Level", ["All"] + DIFFICULTY_LEVELS) | |
# Get subdomains based on selected exam type and domain | |
subdomains = ["All"] | |
if domain != "All" and view_exam_type != "All": | |
subdomains.extend(get_subdomains_for_domain(view_exam_type, domain)) | |
subdomain = st.selectbox("Subdomain", subdomains, key="subdomain_select") | |
# Get topics based on selected exam type, domain, and subdomain | |
topics = ["All"] | |
if subdomain != "All" and domain != "All" and view_exam_type != "All": | |
topics.extend(get_topics_for_subdomain(view_exam_type, domain, subdomain)) | |
topic = st.selectbox("Topic", topics, key="topic_select") | |
with col3: | |
# Add source file filter | |
source_files = ["All"] + get_unique_source_files() | |
source_file = st.selectbox("📚 Source Book/PDF", source_files, help="Filter questions by their source PDF file")
# Apply filters | |
filters = { | |
'exam_type': view_exam_type if view_exam_type != "All" else None, | |
'difficulty_level': difficulty if difficulty != "All" else None, | |
'domain': domain if domain != "All" else None, | |
'subdomain': subdomain if subdomain != "All" else None, | |
'topic': topic if topic != "All" else None, | |
'source_file': source_file if source_file != "All" else None | |
} | |
# Remove None values from filters | |
filters = {k: v for k, v in filters.items() if v is not None} | |
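# Illustrative example: choosing SAT / Mathematics / Hard in the widgets yields
#   {'exam_type': 'SAT', 'domain': 'Mathematics', 'difficulty_level': 'Hard'}
# and get_questions() narrows the query with these key/value pairs.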
# Get filtered questions | |
questions = get_questions(filters) | |
if not questions: | |
st.info("No questions found matching the selected filters.") | |
else: | |
st.success(f"Found {len(questions)} questions") | |
# Add search functionality | |
search_query = st.text_input("🔍 Search questions", placeholder="Enter keywords to search in questions, passages, or options...")
if search_query: | |
# Filter questions based on search query | |
filtered_questions = [] | |
search_terms = search_query.lower().split() | |
for question in questions: | |
searchable_text = ( | |
f"{question.get('question_text', '')} " | |
f"{question.get('reading_passage', '')} " | |
f"{question.get('option_a', '')} " | |
f"{question.get('option_b', '')} " | |
f"{question.get('option_c', '')} " | |
f"{question.get('option_d', '')}" | |
).lower() | |
# Check if all search terms are present in the searchable text | |
if all(term in searchable_text for term in search_terms): | |
filtered_questions.append(question) | |
questions = filtered_questions | |
if not questions: | |
st.warning(f"No questions found matching the search term: '{search_query}'") | |
else: | |
st.success(f"Found {len(questions)} questions matching your search") | |
# Pagination | |
questions_per_page = 10 | |
if 'current_page' not in st.session_state: | |
st.session_state.current_page = 1 | |
total_pages = max(1, (len(questions) + questions_per_page - 1) // questions_per_page)
# Clamp the page number in case a new filter or search shrank the result set
if st.session_state.current_page > total_pages:
    st.session_state.current_page = total_pages
# Calculate start and end indices for current page | |
start_idx = (st.session_state.current_page - 1) * questions_per_page | |
end_idx = min(start_idx + questions_per_page, len(questions)) | |
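# e.g. on page 2 with 10 questions per page, this slice shows questions[10:20]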
# Display current page questions | |
for i, question in enumerate(questions[start_idx:end_idx], start=start_idx): | |
display_question(question, i) | |
# Pagination controls | |
col1, col2, col3 = st.columns([1, 2, 1]) | |
with col1: | |
if st.session_state.current_page > 1: | |
if st.button("β Previous"): | |
st.session_state.current_page -= 1 | |
st.rerun() | |
with col2: | |
st.write(f"Page {st.session_state.current_page} of {total_pages}") | |
with col3: | |
if st.session_state.current_page < total_pages: | |
if st.button("Next β"): | |
st.session_state.current_page += 1 | |
st.rerun() | |
with tab_analytics: | |
# Get all questions for analytics | |
all_questions = get_questions() | |
analytics = get_analytics_data(all_questions) | |
# Add source file management section | |
st.markdown(""" | |
<div style='text-align: center; margin: 30px 0;'> | |
<h3 style='color: #0f4c81;'>📁 Source File Management</h3>
</div> | |
""", unsafe_allow_html=True) | |
# Get unique source files | |
source_files = get_unique_source_files() | |
if not source_files: | |
st.info("No source files found in the database.") | |
else: | |
# Create a container for the source files | |
with st.container(): | |
# Display source files in a grid | |
cols = st.columns(3) | |
for idx, source_file in enumerate(source_files): | |
col = cols[idx % 3] | |
with col: | |
# Count questions for this source file | |
question_count = len([q for q in all_questions if q.get('source_file') == source_file]) | |
# Create an expander for each source file | |
with st.expander(f"π {source_file}", expanded=False): | |
st.markdown(f"**Questions:** {question_count}") | |
# Add delete button with confirmation. A session-state flag keeps the
# confirmation visible across reruns; nesting st.button calls does not
# work, because the outer button reads False again on the rerun that the
# inner click triggers.
confirm_key = f"confirm_{source_file}"
if st.button("🗑️ Delete", key=f"delete_{source_file}"):
    st.session_state[confirm_key] = True
if st.session_state.get(confirm_key, False):
    st.warning(f"Are you sure you want to delete all questions from {source_file}?")
    col1, col2 = st.columns(2)
    with col1:
        if st.button("✅ Yes", key=f"yes_{source_file}"):
            try:
                # Delete all questions with this source file
                response = supabase.table("exam_contents")\
                    .delete()\
                    .eq("source_file", source_file)\
                    .execute()
                if response.data:
                    st.success(f"Successfully deleted all questions from {source_file}")
                    st.session_state[confirm_key] = False
                    # Rerun to refresh the page
                    st.rerun()
                else:
                    st.error("Failed to delete questions")
            except Exception as e:
                st.error(f"Error deleting questions: {str(e)}")
    with col2:
        if st.button("❌ No", key=f"no_{source_file}"):
            st.session_state[confirm_key] = False
            st.rerun()
# Add spacing before analytics | |
st.markdown("<br><br>", unsafe_allow_html=True) | |
# Display analytics | |
display_analytics(analytics) | |
st.markdown( | |
""" | |
--- | |
**Note**: This application uses OpenAI services to generate exam questions and uploads them to Supabase. Ensure that your API credentials are correctly set in the environment variables. | |
""" | |
) |