Upload folder using huggingface_hub
- .github/workflows/update_space.yml +28 -0
- .gitignore +4 -0
- README.md +76 -8
- app.py +1500 -0
- fix.py +349 -0
- requirements.txt +9 -0
.github/workflows/update_space.yml (ADDED)
@@ -0,0 +1,28 @@
```yaml
name: Run Python script

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
    - name: Checkout
      uses: actions/checkout@v2

    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: '3.9'

    - name: Install Gradio
      run: python -m pip install gradio

    - name: Log in to Hugging Face
      run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'

    - name: Deploy to Spaces
      run: gradio deploy
```
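The login step can be checked outside CI before trusting the workflow. A minimal sketch (not part of the commit): it assumes a write-scoped token exported locally as `HF_TOKEN`, whereas the workflow reads the `hf_token` repository secret instead.

```python
# Local sanity check for the token the workflow relies on. Assumes HF_TOKEN
# is set in the environment; in CI the workflow passes the `hf_token`
# repository secret instead.
import os
import huggingface_hub

huggingface_hub.login(token=os.environ["HF_TOKEN"])
print(huggingface_hub.whoami()["name"])  # the account the token authenticates as
```

`gradio deploy` then pushes the app to the Space associated with the logged-in account.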
.gitignore (ADDED)
@@ -0,0 +1,4 @@
```
/.env
/.pytest_cache
/generated_pdfs/
main.py
```
README.md (CHANGED)
````diff
@@ -1,12 +1,80 @@
 ---
-title:
-emoji: 📊
-colorFrom: yellow
-colorTo: blue
-sdk: gradio
-sdk_version: 5.13.1
+title: rag_ielts
 app_file: app.py
-
+sdk: gradio
+sdk_version: 4.44.1
 ---
+# Exam Content Management System
+
+A Streamlit application for managing and generating exam content using Azure OpenAI and MongoDB.
+
+## Features
+
+- Upload and process PDF files containing exam content
+- Generate questions based on various criteria
+- Store and manage exam content in MongoDB
+- Support for IELTS, TOEFL, and SAT exam types
+- Beautiful UI with Streamlit
+
+## Prerequisites
+
+- Python 3.8+
+- MongoDB database
+- Azure OpenAI API access
+
+## Installation
+
+1. Clone the repository:
+```bash
+git clone <repository-url>
+cd <repository-directory>
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+3. Set up environment variables:
+   - Copy `.env.example` to `.env`
+   - Fill in your Azure OpenAI credentials
+   - Add your MongoDB connection details
+
+## MongoDB Setup
+
+1. Create a MongoDB database (either local or cloud-hosted like MongoDB Atlas)
+2. Update the `.env` file with your MongoDB connection string:
+```
+MONGODB_URI=mongodb://username:password@host:port/database
+MONGODB_DB=exam_content_db
+```
+
+## Running the Application
+
+1. Start the Streamlit app:
+```bash
+streamlit run main.py
+```
+
+2. Open your browser and navigate to the URL shown in the terminal (usually http://localhost:8501)
+
+## Usage
+
+1. Upload Content:
+   - Select exam type
+   - Upload PDF files containing exam content
+   - Process the uploads
+
+2. Generate Questions:
+   - Choose exam type, section, and other criteria
+   - Select or generate reading passages
+   - Generate questions
+   - Download generated content as PDF
+
+## Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request.
+
+## License
 
-
+This project is licensed under the MIT License - see the LICENSE file for details.
````
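Worth noting before the code: the README above refers to MongoDB and `streamlit run main.py`, while the `.gitignore` excludes `main.py` and the committed `app.py` validates Azure OpenAI and Supabase settings at startup. A minimal environment check along those lines (variable names taken from `app.py` below; the snippet itself is not part of the commit):

```python
# Check for the variables app.py requires at startup; app.py raises
# ValueError when any of these is missing.
import os
from dotenv import load_dotenv

load_dotenv()
required = [
    "AZURE_OPENAI_KEY", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_DEPLOYMENT_NAME",
    "SUPABASE_DB_URL", "SUPABASE_API_KEY",
]
missing = [name for name in required if not os.getenv(name)]
print("missing:", missing or "none")
```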
app.py (ADDED)
@@ -0,0 +1,1500 @@
```python
import os
import streamlit as st
import PyPDF2
from openai import AzureOpenAI
import uuid
from typing import List, Dict, Any, Optional
from supabase import create_client, Client
from dotenv import load_dotenv
import json
import re
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
import threading
from pydantic import BaseModel, Field
import logging
import pandas as pd
import plotly.express as px
import subprocess

# Set up logging
logging.basicConfig(level=logging.INFO)

# Load environment variables from .env file (if present)
load_dotenv()

# Constants
EXAM_TYPES = ["SAT", "IELTS", "TOEFL"]
DIFFICULTY_LEVELS = ["Easy", "Medium", "Hard", "Very Hard"]

class ExamQuestion(BaseModel):
    exam_type: str
    content_type: str = "Generated"
    exam_section: str
    domain: str
    subdomain: str
    topic: str
    difficulty_level: str = "Medium"
    reading_passage: str
    reading_passage_title: Optional[str] = None
    question_text: str
    option_a: str
    option_b: str
    option_c: str
    option_d: str
    correct_answer: str
    explanation: str
    is_active: bool = True

class ExamQuestionResponse(BaseModel):
    questions: List[ExamQuestion]

# Set up Azure OpenAI client
try:
    API_KEY = os.getenv("AZURE_OPENAI_KEY")
    ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
    DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

    if not API_KEY or not ENDPOINT or not DEPLOYMENT_NAME:
        raise ValueError("Azure OpenAI configuration is incomplete. Please set AZURE_OPENAI_KEY, AZURE_OPENAI_ENDPOINT, and AZURE_OPENAI_DEPLOYMENT_NAME in the environment variables.")

    logging.info(f"Using Azure OpenAI Configuration: Endpoint={ENDPOINT}, Deployment Name={DEPLOYMENT_NAME}")
    logging.info(f"Azure OpenAI Configuration: Endpoint={ENDPOINT}, Deployment Name={DEPLOYMENT_NAME}, API Key={API_KEY[:4]}... (masked)")

    client = AzureOpenAI(
        api_key=API_KEY,
        api_version="2024-02-15-preview",
        azure_endpoint=ENDPOINT
    )
    logging.info("Azure OpenAI client initialized successfully.")
except ValueError as ve:
    logging.error(f"Configuration Error: {ve}")
    st.error("Azure OpenAI configuration is incomplete. Please check the environment variables.")
except Exception as e:
    logging.error(f"Failed to initialize Azure OpenAI client: {e}")
    st.error(f"Failed to initialize Azure OpenAI client: {str(e)}")

# Set up Supabase client
SUPABASE_URL = os.getenv("SUPABASE_DB_URL")
SUPABASE_API_KEY = os.getenv("SUPABASE_API_KEY")
if not SUPABASE_URL or not SUPABASE_API_KEY:
    raise ValueError("Supabase URL and API Key must be set in environment variables.")
supabase: Client = create_client(SUPABASE_URL, SUPABASE_API_KEY)

# Create a thread-safe queue for logging
log_queue = Queue()

def safe_st_warning(message: str):
    """Thread-safe way to queue warning messages"""
    log_queue.put(("warning", message))

def safe_st_error(message: str):
    """Thread-safe way to queue error messages"""
    log_queue.put(("error", message))

# Define the domain structures
domain_structures = {
    "SAT": """SAT Domains and Subdomains:
1. Reading and Writing:
   - Information and Ideas:
     * Central Ideas and Details
     * Command of Textual Evidence
     * Command of Quantitative Evidence
     * Inferences
     * Words in Context
   - Craft and Structure:
     * Text Structure and Purpose
     * Cross-Text Connections
     * Rhetorical Synthesis
     * Boundaries
     * Transitions
2. Mathematics:
   - Algebra:
     * Linear equations in one variable
     * Linear equations in two variables
     * Linear functions
     * Systems of two linear equations in two variables
     * Linear inequalities in one or two variables
   - Advanced Mathematics:
     * Equivalent expressions
     * Nonlinear equations in one variable and systems of equations in two variables
     * Nonlinear functions
   - Problem Solving and Data Analysis:
     * Ratios, rates, proportional relationships, and units
     * Percentages
     * One-variable data: distributions and measures of center and spread
     * Two-variable data: models and scatterplots
     * Probability and conditional probability
     * Inference from sample statistics and margin of error
     * Evaluating statistical claims: observational studies and experiments
   - Geometry and Trigonometry:
     * Area and volume
     * Lines, angles, and triangles
     * Right triangles and trigonometry
     * Circles""",

    "IELTS": """IELTS Domains and Subdomains:
1. Reading:
   - Information Location:
     * Scanning for Details
     * Skimming for Main Ideas
     * Locating Specific Information
     * Finding Supporting Evidence
   - Critical Analysis:
     * Author's Purpose
     * Text Organization
     * Opinion and Attitude
     * Argument Analysis
   - Vocabulary and Reference:
     * Word Meaning in Context
     * Reference Words
     * Paraphrase Recognition
     * Academic Vocabulary
2. Writing:
   - Task Analysis:
     * Data Interpretation
     * Process Description
     * Compare and Contrast
     * Problem and Solution
   - Essay Development:
     * Argument Construction
     * Evidence Support
     * Coherence and Cohesion
     * Academic Style
   - Language Control:
     * Grammar Range
     * Vocabulary Usage
     * Sentence Structure
     * Punctuation
3. Speaking:
   - Personal Expression:
     * Self Introduction
     * Personal Experience
     * Opinion Expression
     * Future Plans
   - Topic Development:
     * Extended Discourse
     * Topic Analysis
     * Example Provision
     * Abstract Discussion
   - Communication Skills:
     * Fluency and Coherence
     * Pronunciation
     * Interactive Communication
     * Response Relevance
4. Listening:
   - Academic Understanding:
     * Lecture Comprehension
     * Discussion Analysis
     * Main Points Identification
     * Detail Recognition
   - Pragmatic Understanding:
     * Speaker Attitude
     * Function of Utterances
     * Degree of Certainty
     * Speaker Relationship
   - Connecting Information:
     * Information Organization
     * Connecting Content
     * Understanding Examples
     * Making Inferences
5. Speaking:
   - Independent Tasks:
     * Opinion Expression
     * Personal Experience
     * Preference Justification
     * Choice Explanation
   - Integrated Tasks:
     * Lecture Summary
     * Reading-Listening Integration
     * Campus Situation Response
     * Academic Topic Discussion
   - Delivery Skills:
     * Pronunciation
     * Intonation
     * Rhythm and Pacing
     * Natural Flow
6. Writing:
   - Independent Writing:
     * Essay Organization
     * Thesis Development
     * Evidence Support
     * Conclusion Writing
   - Integrated Writing:
     * Source Integration
     * Information Synthesis
     * Accurate Reporting
     * Response Organization
   - Language Control:
     * Grammar Accuracy
     * Vocabulary Range
     * Sentence Variety
     * Academic Style""",

    "TOEFL": """TOEFL Domains and Subdomains:
1. Reading:
   - Comprehension:
     * Main Idea and Details
     * Inference Making
     * Author's Purpose
     * Vocabulary in Context
   - Analysis:
     * Text Organization
     * Information Integration
     * Argument Evaluation
     * Evidence Assessment
   - Academic Skills:
     * Paraphrase Recognition
     * Summary Skills
     * Table Completion
     * Classification
2. Listening:
   - Academic Understanding:
     * Lecture Comprehension
     * Discussion Analysis
     * Main Points Identification
     * Detail Recognition
   - Pragmatic Understanding:
     * Speaker Attitude
     * Function of Utterances
     * Degree of Certainty
     * Speaker Relationship
   - Connecting Information:
     * Information Organization
     * Connecting Content
     * Understanding Examples
     * Making Inferences
3. Speaking:
   - Independent Tasks:
     * Opinion Expression
     * Personal Experience
     * Preference Justification
     * Choice Explanation
   - Integrated Tasks:
     * Lecture Summary
     * Reading-Listening Integration
     * Campus Situation Response
     * Academic Topic Discussion
   - Delivery Skills:
     * Pronunciation
     * Intonation
     * Rhythm and Pacing
     * Natural Flow
4. Writing:
   - Independent Writing:
     * Essay Organization
     * Thesis Development
     * Evidence Support
     * Conclusion Writing
   - Integrated Writing:
     * Source Integration
     * Information Synthesis
     * Accurate Reporting
     * Response Organization
   - Language Control:
     * Grammar Accuracy
     * Vocabulary Range
     * Sentence Variety
     * Academic Style"""
}

def clean_text(text: str) -> str:
    """Clean extracted text from PDF."""
    # Remove OCR artifacts and fix common issues
    text = re.sub(r'\.{3,}', '...', text)  # Replace multiple dots with ellipsis
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # Add space between joined words
    text = re.sub(r'(\d+)\.(\d+)', r'\1. \2', text)  # Fix numbered lists
    text = text.replace('..............', '')  # Remove dot lines
    text = re.sub(r'Line\s+\d+', '', text)  # Remove line numbers
    text = re.sub(r'Page\s+\d+', '', text)  # Remove page numbers
    text = re.sub(r'CONTINUE\s+\d+', '', text)  # Remove continue markers
    text = re.sub(r'Unauthorized.*illegal\.', '', text)  # Remove copyright notices
    text = text.replace('©', '(c)')  # Replace copyright symbol

    # Fix common OCR issues
    text = re.sub(r'(?<=\d)\s+(?=\d)', '', text)  # Remove spaces between numbers
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)  # Add space between words
    text = re.sub(r'(?<=\w)\.(?=\w)', '. ', text)  # Add space after period
    text = re.sub(r'(?<=\w),(?=\w)', ', ', text)  # Add space after comma
    text = re.sub(r'arebasedonthe', ' are based on the ', text)  # Fix common OCR error
    text = re.sub(r'Questions\d+\-\d+', 'Questions ', text)  # Clean question numbers
    text = re.sub(r'\s*\n\s*', '\n', text)  # Clean up newlines
    text = re.sub(r'\n{3,}', '\n\n', text)  # Reduce multiple newlines

    # Clean up percentage signs and numbers in tables
    text = re.sub(r'(\d+)\s*\.\s*(\d+)', r'\1.\2', text)  # Fix decimal numbers
    text = re.sub(r'(\d+)\s*%', r'\1%', text)  # Fix percentage signs

    return text.strip()

def extract_text_from_pdf(pdf_file) -> List[str]:
    """
    Extracts text from a PDF file in overlapping 3-page chunks.
    For example, for a 6-page PDF:
    - Chunk 1: Pages 1-2-3
    - Chunk 2: Pages 2-3-4
    - Chunk 3: Pages 3-4-5
    - Chunk 4: Pages 4-5-6

    Args:
        pdf_file: Uploaded PDF file.

    Returns:
        List of text chunks, each containing 3 pages with overlap.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    num_pages = len(reader.pages)
    text_chunks = []
    chunk_size = 3  # Fixed size of 3 pages per chunk

    # Create overlapping chunks
    for chunk_start in range(0, max(1, num_pages - chunk_size + 1)):
        text = ""
        # Calculate end page for this chunk (inclusive)
        chunk_end = min(chunk_start + chunk_size, num_pages)

        # Extract text from all pages in this chunk
        for page_num in range(chunk_start, chunk_end):
            page = reader.pages[page_num]
            page_text = page.extract_text()
            if page_text:
                text += f"\n--- Page {page_num + 1} ---\n{clean_text(page_text)}\n"

        if text.strip():  # Ensure non-empty
            text_chunks.append(clean_text(text))

    return text_chunks

def clean_json_string(text: str) -> str:
    """
    Clean and extract JSON from the response text.
    """
    # Try to find JSON array or object pattern
    json_match = re.search(r'(\[|\{).*(\]|\})', text, re.DOTALL)
    if json_match:
        potential_json = json_match.group(0)
        # Remove any markdown code block syntax
        potential_json = re.sub(r'```json\s*|\s*```', '', potential_json)
        # Remove any trailing commas before closing brackets
        potential_json = re.sub(r',(\s*[\}\]])', r'\1', potential_json)
        return potential_json
    return text
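# Example of what clean_json_string repairs (annotation, not part of the
# commit): a fenced model reply such as
#   '```json\n{"questions": [{"topic": "Inferences",},]}\n```'
# is reduced to '{"questions": [{"topic": "Inferences"}]}': the regex keeps
# the span from the first brace to the last, strips any fence markers, and
# drops both trailing commas, leaving a string json.loads accepts.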

def process_chunk(chunk: str, exam_type: str, idx: int, structure: str) -> List[Dict[str, Any]]:
    """
    Process a single text chunk to generate multiple questions using Azure OpenAI model with structured output.
    """
    # First, clean the text using LLM
    clean_text_prompt = f"""Clean and format the following text while preserving its meaning and structure.

Guidelines:
1. Fix OCR artifacts and formatting issues
2. Add proper spacing between words
3. Fix line breaks and paragraphs
4. Preserve all content including tables and data
5. Keep all numerical values and statistics
6. Maintain academic/formal style
7. Keep the text exactly as is, just make it readable
8. Do not summarize or modify the content
9. Do not add or remove information
10. Keep all citations and references

Text to clean:
{chunk}

Return ONLY the cleaned text with no additional comments or explanations."""

    try:
        clean_response = client.chat.completions.create(
            model=DEPLOYMENT_NAME,
            messages=[
                {
                    "role": "system",
                    "content": "You are a text cleaning expert. Your job is to fix formatting and OCR issues while preserving the exact content and meaning of the text."
                },
                {"role": "user", "content": clean_text_prompt}
            ],
            temperature=0.0,  # Use 0 temperature for consistent cleaning
        )

        cleaned_chunk = clean_response.choices[0].message.content.strip()

        # Now proceed with question generation using the cleaned text
        prompt = f"""Generate multiple {exam_type} exam questions based on the provided text. You MUST generate at least 3 questions for each chunk of text. Return ONLY a JSON array of questions.

Domain Structure:
{structure}

IMPORTANT: You MUST include ALL of the following fields for each question. Missing fields will cause errors:
- exam_type: The type of exam (e.g., "{exam_type}")
- content_type: Set to "Generated"
- exam_section: The lowercase exam type (e.g., "{exam_type.lower()}")
- domain: The main domain from the structure (e.g., "Reading and Writing")
- subdomain: The subdomain from the structure (e.g., "Information and Ideas")
- topic: The specific topic from the structure (e.g., "Central Ideas and Details")
- difficulty_level: One of ["Easy", "Medium", "Hard", "Very Hard"]
- reading_passage: The COMPLETE, cleaned passage text
- reading_passage_title: The title if available, or null
- question_text: The actual question (REQUIRED)
- option_a: First option (REQUIRED)
- option_b: Second option (REQUIRED)
- option_c: Third option (REQUIRED)
- option_d: Fourth option (REQUIRED)
- correct_answer: Must be "A", "B", "C", or "D" (REQUIRED)
- explanation: Detailed explanation of the correct answer (REQUIRED)
- is_active: Set to true

Instructions for Multiple Questions:
1. You MUST generate at least 3 questions for this text chunk
2. Each question should focus on a different aspect or detail from the text
3. Use different subdomains and topics for variety
4. Vary the difficulty levels across questions
5. Make sure each question tests a different skill or concept
6. Questions should build on each other but be independently answerable
7. Use a mix of question types:
   - Main idea questions
   - Detail questions
   - Inference questions
   - Vocabulary in context
   - Purpose questions
   - Structure questions
   - Evidence questions

For each question:
- Use the EXACT, COMPLETE passage text provided - do not modify, summarize, or shorten it
- Ensure questions are directly related to and answerable from the passage content
- Questions should test understanding of key concepts, details, or relationships presented in the passage
- All answer options should be plausible but with only one clearly correct answer
- The explanation should reference specific parts of the passage to justify the correct answer

IMPORTANT:
- You MUST return an array with AT LEAST 3 questions
- Each question must have ALL required fields
- Questions must be diverse in type and difficulty
- All questions must be directly answerable from the passage
- The reading_passage must be identical for all questions from the same chunk

Text to analyze:
\"\"\"
{cleaned_chunk}
\"\"\""""

        response = client.chat.completions.create(
            model=DEPLOYMENT_NAME,
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert exam question generator. You MUST generate at least 3 complete questions with ALL required fields for each text chunk. Never omit any fields. Ensure proper JSON formatting. Vary the question types and difficulty levels."
                },
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"},
            temperature=0.1,
        )
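        # Note (annotation, not part of the commit): with
        # response_format={"type": "json_object"} the API returns a single
        # JSON object rather than a bare array, even though the prompt asks
        # for an array; the isinstance handling below accepts either shape.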

        # Parse the response content
        content = response.choices[0].message.content.strip()

        # Clean and parse JSON
        cleaned_json = clean_json_string(content)

        try:
            parsed_data = json.loads(cleaned_json)

            # Validate each question has all required fields before creating ExamQuestionResponse
            required_fields = [
                "exam_type", "content_type", "exam_section", "domain", "subdomain",
                "topic", "difficulty_level", "reading_passage", "question_text",
                "option_a", "option_b", "option_c", "option_d", "correct_answer",
                "explanation", "is_active"
            ]

            if isinstance(parsed_data, list):
                questions = parsed_data
            elif isinstance(parsed_data, dict) and "questions" in parsed_data:
                questions = parsed_data["questions"]
            else:
                questions = [parsed_data]

            # Validate each question
            valid_questions = []
            for q in questions:
                missing_fields = [f for f in required_fields if f not in q or not q[f]]
                if not missing_fields:
                    valid_questions.append(q)
                else:
                    logging.warning(f"Skipping question due to missing fields: {missing_fields}")

            if len(valid_questions) < 3:
                logging.warning(f"Generated only {len(valid_questions)} valid questions, expected at least 3")

            if valid_questions:
                response_data = ExamQuestionResponse(questions=valid_questions)
                return [question.model_dump() for question in response_data.questions]
            else:
                logging.error("No valid questions found after validation")
                return []

        except json.JSONDecodeError as je:
            logging.error(f"JSON parsing error in chunk {idx + 1}: {str(je)}")
            logging.error(f"Problematic JSON: {cleaned_json[:500]}...")
            return []
        except Exception as e:
            logging.error(f"Error validating questions: {str(e)}")
            return []

    except Exception as e:
        logging.error(f"Error processing chunk {idx + 1}: {str(e)}")
        safe_st_error(f"Error generating questions for chunk {idx + 1}: {str(e)}")
        return []

def generate_questions(text_chunks: List[str], exam_type: str) -> List[Dict[str, Any]]:
    """
    Generates questions for each text chunk using the Azure OpenAI deployment and returns structured JSON.
    Uses multithreading to process chunks concurrently.
    """
    questions = []
    structure = domain_structures.get(exam_type, "")

    # Create progress tracking elements in the main thread
    progress_placeholder = st.empty()
    status_placeholder = st.empty()
    metrics_placeholder = st.empty()

    # Process chunks concurrently
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_chunk, chunk, exam_type, idx, structure)
            for idx, chunk in enumerate(text_chunks)
        ]

        completed = 0
        total = len(text_chunks)
        total_questions = 0
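        # Note (annotation, not part of the commit): as_completed yields
        # futures in completion order, not submission order, so questions
        # from a later chunk can precede those of an earlier one in the
        # final list.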

        # Process results as they complete
        for future in as_completed(futures):
            try:
                chunk_questions = future.result()
                questions.extend(chunk_questions)
                total_questions += len(chunk_questions)

                # Update progress in the main thread
                completed += 1
                progress = completed / total

                # Update UI elements
                progress_placeholder.progress(progress)
                status_placeholder.text(f"Processing chunks: {completed}/{total}")
                metrics_placeholder.metric(
                    label="Progress",
                    value=f"{completed}/{total} chunks",
                    delta=f"{total_questions} questions generated"
                )

                # Process any queued messages
                while not log_queue.empty():
                    msg_type, message = log_queue.get()
                    if msg_type == "warning":
                        st.warning(message)
                    elif msg_type == "error":
                        st.error(message)

            except Exception as e:
                st.error(f"Error processing chunk: {str(e)}")

    # Show final summary
    st.success(f"✅ Processing complete! Generated {total_questions} questions from {total} chunks.")

    # Clear progress tracking elements
    progress_placeholder.empty()
    status_placeholder.empty()
    metrics_placeholder.empty()

    return questions

def upload_questions_to_supabase(generated_questions: List[Dict[str, Any]], source_file: str):
    """
    Uploads generated questions to Supabase.

    Args:
        generated_questions: List of question dictionaries.
        source_file: Name of the source PDF file.
    """
    # Create progress tracking for uploads
    st.write("### Upload Progress")
    upload_progress = st.progress(0)
    upload_status = st.empty()

    total = len(generated_questions)
    successful_uploads = 0
    failed_uploads = 0

    for idx, question in enumerate(generated_questions):
        # Generate a new valid UUID regardless of what was provided
        new_uuid = str(uuid.uuid4())

        # Set default values if not present and match the table schema
        question_fields = {
            "id": new_uuid,  # Always use our generated UUID
            "exam_type": question.get("exam_type", "Unknown"),
            "content_type": question.get("content_type", "Generated"),
            "exam_section": question.get("exam_section") or question.get("exam_type", "Unknown").lower(),
            "domain": question.get("domain", "General"),
            "subdomain": question.get("subdomain", "General"),
            "topic": question.get("topic", "General"),
            "difficulty_level": question.get("difficulty_level"),
            "reading_passage": question.get("reading_passage"),
            "question_text": question.get("question_text", "Not Available"),
            "option_a": question.get("option_a"),
            "option_b": question.get("option_b"),
            "option_c": question.get("option_c"),
            "option_d": question.get("option_d"),
            "correct_answer": question.get("correct_answer", "Not Available"),
            "explanation": question.get("explanation"),
            "source_file": source_file,
            "is_active": question.get("is_active", True),
            "metadata": json.dumps(question.get("metadata")) if question.get("metadata") else None,
            "source_text": question.get("source_text")
        }

        try:
            # Insert the question and get the response
            response = supabase.table("exam_contents").insert(question_fields).execute()

            # Check if the response data indicates success
            if response.data:
                successful_uploads += 1
            else:
                failed_uploads += 1
                st.warning(f"Failed to insert question: {response.error}")

        except Exception as e:
            failed_uploads += 1
            st.error(f"Error uploading question: {str(e)}")

        # Update progress
        progress = (idx + 1) / total
        upload_progress.progress(progress)
        upload_status.text(f"Uploading questions: {idx + 1}/{total} (Success: {successful_uploads}, Failed: {failed_uploads})")

    # Show final upload summary
    if failed_uploads == 0:
        st.success(f"✅ Upload complete! Successfully uploaded all {successful_uploads} questions.")
    else:
        st.warning(f"⚠️ Upload complete with some issues. Successful: {successful_uploads}, Failed: {failed_uploads}")

    # Clear progress elements
    upload_progress.empty()
    upload_status.empty()

def process_pdfs(pdf_files, exam_type):
    """
    Process multiple PDF files and generate questions.

    Args:
        pdf_files: List of uploaded PDF files
        exam_type: Selected exam type

    Returns:
        Combined questions JSON and download content
    """
    all_questions = []
    progress_text = st.empty()
    progress_bar = st.progress(0)

    for i, pdf_file in enumerate(pdf_files):
        progress_text.text(f"Processing file {i+1}/{len(pdf_files)}: {pdf_file.name}")

        # Convert bytes to file-like object if necessary
        if isinstance(pdf_file, bytes):
            pdf_file_obj = BytesIO(pdf_file)
        else:
            pdf_file_obj = pdf_file

        # Extract text
        text_chunks = extract_text_from_pdf(pdf_file_obj)
        if not text_chunks:
            st.warning(f"No text extracted from {pdf_file.name}")
            continue

        # Generate questions
        file_questions = generate_questions(text_chunks, exam_type)
        if file_questions:
            all_questions.extend(file_questions)

            # Upload to Supabase
            source_file = pdf_file.name
            upload_questions_to_supabase(file_questions, source_file)

        # Update progress
        progress_bar.progress((i + 1) / len(pdf_files))

    progress_text.empty()
    progress_bar.empty()

    if not all_questions:
        st.warning("No questions were generated from any of the files.")
        return None, None

    # Prepare JSON output
    combined_questions_json = json.dumps(all_questions, indent=4)
    return combined_questions_json, combined_questions_json.encode('utf-8')

def get_questions(filters=None):
    """Fetch questions from Supabase with optional filters."""
    try:
        query = supabase.table("exam_contents").select("*")

        if filters:
            for key, value in filters.items():
                if value and value != "All":
                    query = query.eq(key, value)

        response = query.execute()
        return response.data
    except Exception as e:
        logging.error(f"Error fetching questions: {e}")
        return []

def get_analytics_data(questions):
    """Generate analytics data from questions."""
    df = pd.DataFrame(questions)

    analytics = {
        'total_questions': len(df),
        'unfixed_questions': len([q for q in questions if not q.get('is_fixed', False)])
    }

    # Basic statistics
    if 'exam_type' in df.columns:
        analytics['questions_by_exam'] = df['exam_type'].value_counts()
    else:
        analytics['questions_by_exam'] = pd.Series(dtype='int64')

    if 'difficulty_level' in df.columns:
        analytics['questions_by_difficulty'] = df['difficulty_level'].value_counts()
    else:
        analytics['questions_by_difficulty'] = pd.Series(dtype='int64')

    if 'domain' in df.columns:
        analytics['questions_by_domain'] = df['domain'].value_counts()
    else:
        analytics['questions_by_domain'] = pd.Series(dtype='int64')

    # Include exam_type in the domain/subdomain grouping
    if all(col in df.columns for col in ['exam_type', 'domain', 'subdomain']):
        analytics['questions_by_subdomain'] = df.groupby(['exam_type', 'domain', 'subdomain']).size().reset_index(name='count')
    else:
        analytics['questions_by_subdomain'] = pd.DataFrame(columns=['exam_type', 'domain', 'subdomain', 'count'])

    # Time-based analytics
    if 'created_at' in df.columns:
        df['created_at'] = pd.to_datetime(df['created_at'])
        analytics['questions_by_date'] = df.resample('D', on='created_at').size()
        analytics['questions_by_month'] = df.resample('M', on='created_at').size()
        analytics['recent_activity'] = df.sort_values('created_at', ascending=False).head(10)

    # Content coverage analysis
    if 'reading_passage' in df.columns:
        analytics['has_passage'] = df['reading_passage'].notna().sum()
        analytics['passage_ratio'] = (df['reading_passage'].notna().sum() / len(df)) * 100 if len(df) > 0 else 0

        # Calculate average passage length
        df['passage_length'] = df['reading_passage'].str.len().fillna(0)
        analytics['avg_passage_length'] = df['passage_length'].mean()
        analytics['passage_length_dist'] = df['passage_length'].describe()

    # Question quality metrics
    if 'explanation' in df.columns:
        analytics['has_explanation'] = df['explanation'].notna().sum()
        analytics['explanation_ratio'] = (df['explanation'].notna().sum() / len(df)) * 100 if len(df) > 0 else 0

        # Calculate explanation comprehensiveness
        df['explanation_length'] = df['explanation'].str.len().fillna(0)
        analytics['avg_explanation_length'] = df['explanation_length'].mean()
        analytics['explanation_length_dist'] = df['explanation_length'].describe()

    # Options analysis
    option_cols = ['option_a', 'option_b', 'option_c', 'option_d']
    if all(col in df.columns for col in option_cols):
        df['options_count'] = df[option_cols].notna().sum(axis=1)
        analytics['complete_options'] = (df['options_count'] == 4).sum()
        analytics['options_ratio'] = (analytics['complete_options'] / len(df)) * 100 if len(df) > 0 else 0

    # Domain coverage analysis
    if all(col in df.columns for col in ['exam_type', 'domain', 'subdomain']):
        domain_coverage = df.groupby(['exam_type', 'domain'])['subdomain'].nunique().reset_index()
        domain_coverage.columns = ['exam_type', 'domain', 'unique_subdomains']
        analytics['domain_coverage'] = domain_coverage

        # Calculate domain balance score (0-100) per exam type
        domain_balance_scores = []
        for exam_type in df['exam_type'].unique():
            exam_domain_counts = df[df['exam_type'] == exam_type]['domain'].value_counts()
            if not exam_domain_counts.empty:
                max_count = exam_domain_counts.max()
                min_count = exam_domain_counts.min()
                score = ((1 - (max_count - min_count) / max_count) * 100) if max_count > 0 else 100
                domain_balance_scores.append({'exam_type': exam_type, 'balance_score': score})
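                # Worked example (annotation, not part of the commit):
                # domain counts of {Reading: 10, Writing: 5} give
                # max_count=10, min_count=5, so score = (1 - 5/10) * 100 = 50;
                # perfectly even domain counts score 100.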
|
842 |
+
|
843 |
+
analytics['domain_balance_by_exam'] = pd.DataFrame(domain_balance_scores)
|
844 |
+
analytics['domain_balance_score'] = analytics['domain_balance_by_exam']['balance_score'].mean()
|
845 |
+
|
846 |
+
return analytics
|
847 |
+
|
848 |
+
def rewrite_question(question: Dict[str, Any]) -> Dict[str, Any]:
|
849 |
+
"""
|
850 |
+
Use LLM to rewrite the question, passage, and options while maintaining the same concept.
|
851 |
+
"""
|
852 |
+
prompt = f"""Rewrite the following exam question with a new passage and options. Keep the same concept, difficulty level, and correct answer position, but create fresh content.
|
853 |
+
|
854 |
+
Current Question:
|
855 |
+
Reading Passage: {question.get('reading_passage', '')}
|
856 |
+
Question: {question.get('question_text', '')}
|
857 |
+
Options:
|
858 |
+
A) {question.get('option_a', '')}
|
859 |
+
B) {question.get('option_b', '')}
|
860 |
+
C) {question.get('option_c', '')}
|
861 |
+
D) {question.get('option_d', '')}
|
862 |
+
Correct Answer: {question.get('correct_answer', '')}
|
863 |
+
Explanation: {question.get('explanation', '')}
|
864 |
+
|
865 |
+
IMPORTANT LENGTH REQUIREMENTS:
|
866 |
+
- Reading passage must be AT LEAST 100 characters (preferably 200-300)
|
867 |
+
- Question text must be AT LEAST 50 characters
|
868 |
+
- Options can be concise but clear (no minimum length)
|
869 |
+
- Explanation must be AT LEAST 50 characters
|
870 |
+
|
871 |
+
Requirements:
|
872 |
+
1. Create a new reading passage that:
|
873 |
+
- Must be AT LEAST 100 characters (preferably 200-300)
|
874 |
+
- Covers the same concepts in detail
|
875 |
+
- Maintains similar complexity
|
876 |
+
- Uses rich context and examples
|
877 |
+
|
878 |
+
2. Write a detailed question that:
|
879 |
+
- Must be AT LEAST 50 characters
|
880 |
+
- Clearly states what is being asked
|
881 |
+
- Includes necessary context
|
882 |
+
|
883 |
+
3. Create clear options that:
|
884 |
+
- Are concise but clear
|
885 |
+
- Are distinct from each other
|
886 |
+
- Follow a similar format
|
887 |
+
- Maintain the correct answer in the same position
|
888 |
+
|
889 |
+
4. Write a good explanation that:
|
890 |
+
- Must be AT LEAST 50 characters
|
891 |
+
- Explains the correct answer
|
892 |
+
- Provides clear reasoning
|
893 |
+
- References the passage when relevant
|
894 |
+
|
895 |
+
Return ONLY a JSON object with the following structure:
|
896 |
+
{{
|
897 |
+
"reading_passage": "new_passage (MINIMUM 100 characters)",
|
898 |
+
"question_text": "new_question (MINIMUM 50 characters)",
|
899 |
+
"option_a": "new_option_a (concise)",
|
900 |
+
"option_b": "new_option_b (concise)",
|
901 |
+
"option_c": "new_option_c (concise)",
|
902 |
+
"option_d": "new_option_d (concise)",
|
903 |
+
"explanation": "new_explanation (MINIMUM 50 characters)"
|
904 |
+
}}"""
|
905 |
+
|
906 |
+
try:
|
907 |
+
response = client.chat.completions.create(
|
908 |
+
model=DEPLOYMENT_NAME,
|
909 |
+
messages=[
|
910 |
+
{
|
911 |
+
"role": "system",
|
912 |
+
"content": "You are an expert at rewriting exam questions. Create a detailed reading passage (100+ chars) and clear question (50+ chars). Options should be concise but clear. Explanation should be thorough (50+ chars)."
|
913 |
+
},
|
914 |
+
{"role": "user", "content": prompt}
|
915 |
+
],
|
916 |
+
response_format={"type": "json_object"},
|
917 |
+
temperature=0.7,
|
918 |
+
)
|
919 |
+
|
920 |
+
# Parse the response
|
921 |
+
new_content = json.loads(response.choices[0].message.content)
|
922 |
+
|
923 |
+
# Validate minimum length requirements with detailed error messages
|
924 |
+
length_requirements = {
|
925 |
+
'reading_passage': 100,
|
926 |
+
'question_text': 50,
|
927 |
+
'explanation': 50
|
928 |
+
}
|
929 |
+
|
930 |
+
errors = []
|
931 |
+
for key, min_length in length_requirements.items():
|
932 |
+
value = new_content.get(key, '')
|
933 |
+
current_length = len(value)
|
934 |
+
if current_length < min_length:
|
935 |
+
errors.append(f"{key} is too short: {current_length} chars (minimum {min_length} required)")
|
936 |
+
|
937 |
+
if errors:
|
938 |
+
error_message = "\n".join(errors)
|
939 |
+
raise ValueError(f"Content length requirements not met:\n{error_message}")
|
940 |
+
|
941 |
+
# Update the question with new content while preserving other fields
|
942 |
+
updated_question = question.copy()
|
943 |
+
updated_question.update(new_content)
|
944 |
+
|
945 |
+
return updated_question
|
946 |
+
|
947 |
+
except json.JSONDecodeError as je:
|
948 |
+
error_msg = f"Invalid JSON response from LLM: {str(je)}"
|
949 |
+
logging.error(error_msg)
|
950 |
+
raise ValueError(error_msg)
|
951 |
+
except Exception as e:
|
952 |
+
logging.error(f"Error rewriting question: {str(e)}")
|
953 |
+
raise e
|
954 |
+
|
955 |
+
def display_question(question, index):
|
956 |
+
"""Display a single question with its details."""
|
957 |
+
with st.expander(f"Question {index + 1}", expanded=index == 0):
|
958 |
+
# Add delete and rewrite buttons in the top right corner
|
959 |
+
col1, col2, col3 = st.columns([5, 1, 1])
|
960 |
+
with col2:
|
961 |
+
if st.button("🔄 Rewrite", key=f"rewrite_{question['id']}", type="primary"):
|
962 |
+
try:
|
963 |
+
with st.spinner("Rewriting question..."):
|
964 |
+
# Rewrite the question
|
965 |
+
updated_question = rewrite_question(question)
|
966 |
+
# Update in Supabase
|
967 |
+
supabase.table("exam_contents").update(updated_question).eq("id", question['id']).execute()
|
968 |
+
st.success("Question rewritten successfully!")
|
969 |
+
# Refresh the page
|
970 |
+
st.rerun()
|
971 |
+
except Exception as e:
|
972 |
+
st.error(f"Error rewriting question: {str(e)}")
|
973 |
+
|
974 |
+
with col3:
|
975 |
+
if st.button("🗑️ Delete", key=f"delete_{question['id']}", type="secondary"):
|
976 |
+
try:
|
977 |
+
# Delete from Supabase
|
978 |
+
supabase.table("exam_contents").delete().eq("id", question['id']).execute()
|
979 |
+
st.success("Question deleted successfully!")
|
980 |
+
# Add a rerun to refresh the page
|
981 |
+
st.rerun()
|
982 |
+
except Exception as e:
|
983 |
+
st.error(f"Error deleting question: {str(e)}")
|
984 |
+
|
985 |
+
# Metadata
|
986 |
+
with col1:
|
987 |
+
col_a, col_b, col_c, col_d = st.columns(4)
|
988 |
+
with col_a:
|
989 |
+
st.markdown(f"**Domain:** {question.get('domain', 'N/A')}")
|
990 |
+
with col_b:
|
991 |
+
st.markdown(f"**Subdomain:** {question.get('subdomain', 'N/A')}")
|
992 |
+
with col_c:
|
993 |
+
st.markdown(f"**Topic:** {question.get('topic', 'N/A')}")
|
994 |
+
with col_d:
|
995 |
+
st.markdown(f"**Difficulty:** {question.get('difficulty_level', 'N/A')}")
|
996 |
+
|
997 |
+
# Reading passage if available
|
998 |
+
if question.get('reading_passage'):
|
999 |
+
st.markdown("### 📖 Reading Passage")
|
1000 |
+
st.markdown(
|
1001 |
+
f"""<div style='background-color: #f0f2f6; padding: 20px; border-radius: 10px; margin: 10px 0; color: #1f1f1f;'>
|
1002 |
+
{question['reading_passage']}
|
1003 |
+
</div>""",
|
1004 |
+
unsafe_allow_html=True
|
1005 |
+
)
|
1006 |
+
|
1007 |
+
# Question text and options
|
1008 |
+
st.markdown("### ❓ Question")
|
1009 |
+
st.markdown(f"{question.get('question_text', '')}")
|
1010 |
+
|
1011 |
+
if any(question.get(f'option_{opt}') for opt in ['a', 'b', 'c', 'd']):
|
1012 |
+
st.markdown("### Options")
|
1013 |
+
options_container = st.container()
|
1014 |
+
with options_container:
|
1015 |
+
for opt in ['a', 'b', 'c', 'd']:
|
1016 |
+
if question.get(f'option_{opt}'):
|
1017 |
+
st.markdown(f"**{opt.upper()}.** {question[f'option_{opt}']}")
|
1018 |
+
|
1019 |
+
# Answer and explanation
|
1020 |
+
st.markdown("### Answer & Explanation")
|
1021 |
+
col1, col2 = st.columns(2)
|
1022 |
+
with col1:
|
1023 |
+
st.markdown(
|
1024 |
+
f"""<div style='background-color: #e8f4ea; padding: 10px; border-radius: 5px; margin: 10px 0; color: #1f1f1f;'>
|
1025 |
+
<strong>Correct Answer:</strong> {question.get('correct_answer', 'N/A')}
|
1026 |
+
</div>""",
|
1027 |
+
unsafe_allow_html=True
|
1028 |
+
)
|
1029 |
+
with col2:
|
1030 |
+
if question.get('explanation'):
|
1031 |
+
st.markdown(
|
1032 |
+
f"""<div style='background-color: #fff3e0; padding: 10px; border-radius: 5px; color: #1f1f1f;'>
|
1033 |
+
<strong>Explanation:</strong><br>{question['explanation']}
|
1034 |
+
</div>""",
|
1035 |
+
unsafe_allow_html=True
|
1036 |
+
)
|
1037 |
+
|
1038 |
+
def display_analytics(analytics):
    """Display analytics visualizations."""
    st.markdown("""
        <h2 style='text-align: center; margin-bottom: 40px;'>📊 Analytics Dashboard</h2>
    """, unsafe_allow_html=True)

    # Add Fix Button
    fix_col1, fix_col2 = st.columns([1, 4])
    with fix_col1:
        if st.button("🔧 Fix Questions", type="primary"):
            with st.spinner("Running fix.py..."):
                result = subprocess.run(['python', 'fix.py'], capture_output=True, text=True)
                if result.returncode == 0:
                    st.success("Fix process completed successfully!")
                else:
                    st.error(f"Error running fix.py: {result.stderr}")

    with fix_col2:
        if analytics.get('unfixed_questions', 0) > 0:
            st.warning(f"🔍 {analytics['unfixed_questions']} questions need fixing")
        else:
            st.success("✅ All questions are fixed")

    # Key Metrics Overview
    st.markdown("""
        <div style='text-align: center; margin-bottom: 30px;'>
            <h3 style='color: #0f4c81;'>Key Metrics</h3>
        </div>
    """, unsafe_allow_html=True)

    metrics_container = st.container()
    with metrics_container:
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("📚 Total Questions", analytics['total_questions'])
        with col2:
            num_domains = len(analytics['questions_by_domain']) if not analytics['questions_by_domain'].empty else 0
            st.metric("🎯 Number of Domains", num_domains)
        with col3:
            if 'passage_ratio' in analytics:
                passage_ratio = f"{analytics['passage_ratio']:.1f}%"
                st.metric("📖 Questions with Passages", passage_ratio)
        with col4:
            if 'domain_balance_score' in analytics:
                balance_score = f"{analytics['domain_balance_score']:.1f}%"
                st.metric("⚖️ Domain Balance Score", balance_score)

    # Content Quality Metrics
    if any(key in analytics for key in ['has_explanation', 'complete_options', 'avg_passage_length']):
        st.markdown("""
            <div style='text-align: center; margin: 30px 0;'>
                <h3 style='color: #0f4c81;'>Content Quality Metrics</h3>
            </div>
        """, unsafe_allow_html=True)

        quality_cols = st.columns(3)
        with quality_cols[0]:
            if 'explanation_ratio' in analytics:
                st.metric("📝 Questions with Explanations",
                          f"{analytics['explanation_ratio']:.1f}%",
                          help="Percentage of questions that have explanations")
        with quality_cols[1]:
            if 'options_ratio' in analytics:
                st.metric("✅ Complete Option Sets",
                          f"{analytics['options_ratio']:.1f}%",
                          help="Percentage of questions with all 4 options")
        with quality_cols[2]:
            if 'avg_passage_length' in analytics:
                st.metric("📊 Avg Passage Length",
                          f"{int(analytics['avg_passage_length'])} chars",
                          help="Average length of reading passages")

    # Time-based Analytics
    if 'questions_by_date' in analytics and not analytics['questions_by_date'].empty:
        st.markdown("""
            <div style='text-align: center; margin: 30px 0;'>
                <h3 style='color: #0f4c81;'>Question Generation Timeline</h3>
            </div>
        """, unsafe_allow_html=True)

        # Daily question generation trend
        fig_timeline = px.line(
            x=analytics['questions_by_date'].index,
            y=analytics['questions_by_date'].values,
            title="Daily Question Generation",
            labels={'x': 'Date', 'y': 'Number of Questions'}
        )
        fig_timeline.update_layout(showlegend=False)
        st.plotly_chart(fig_timeline, use_container_width=True)

        # Monthly aggregation
        if 'questions_by_month' in analytics and not analytics['questions_by_month'].empty:
            fig_monthly = px.bar(
                x=analytics['questions_by_month'].index,
                y=analytics['questions_by_month'].values,
                title="Monthly Question Generation",
                labels={'x': 'Month', 'y': 'Number of Questions'}
            )
            fig_monthly.update_layout(showlegend=False)
            st.plotly_chart(fig_monthly, use_container_width=True)

    # Questions by Exam Type
    if not analytics['questions_by_exam'].empty:
        st.markdown("""
            <div style='text-align: center; margin: 30px 0;'>
                <h3 style='color: #0f4c81;'>Distribution by Exam Type</h3>
            </div>
        """, unsafe_allow_html=True)

        col1, col2, col3 = st.columns([1, 3, 1])
        with col2:
            fig = px.pie(
                values=analytics['questions_by_exam'].values,
                names=analytics['questions_by_exam'].index,
                hole=0.4,
                color_discrete_sequence=px.colors.qualitative.Set3
            )
            fig.update_layout(
                showlegend=True,
                legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
                margin=dict(t=60, b=40, l=40, r=40)
            )
            st.plotly_chart(fig, use_container_width=True)

    # Questions by Difficulty
    if not analytics['questions_by_difficulty'].empty:
        st.markdown("""
            <div style='text-align: center; margin: 30px 0;'>
                <h3 style='color: #0f4c81;'>Distribution by Difficulty Level</h3>
            </div>
        """, unsafe_allow_html=True)

        col1, col2, col3 = st.columns([1, 3, 1])
        with col2:
            fig = px.bar(
                x=analytics['questions_by_difficulty'].index,
                y=analytics['questions_by_difficulty'].values,
                color=analytics['questions_by_difficulty'].index,
                color_discrete_sequence=px.colors.qualitative.Set2
            )
            fig.update_layout(
                showlegend=False,
                xaxis_title="Difficulty Level",
                yaxis_title="Number of Questions",
                margin=dict(t=40, b=40, l=40, r=40)
            )
            st.plotly_chart(fig, use_container_width=True)

    # Domain Coverage Analysis
    if 'domain_coverage' in analytics and not analytics['domain_coverage'].empty:
        st.markdown("""
            <div style='text-align: center; margin: 30px 0;'>
                <h3 style='color: #0f4c81;'>Domain Coverage Analysis</h3>
            </div>
        """, unsafe_allow_html=True)

        # Unique subdomains per domain (bar chart)
        fig_coverage = px.bar(
            analytics['domain_coverage'],
            x='domain',
            y='unique_subdomains',
            title="Number of Unique Subdomains per Domain",
            color='unique_subdomains',
            color_continuous_scale='Viridis'
        )
        fig_coverage.update_layout(
            xaxis_title="Domain",
            yaxis_title="Number of Unique Subdomains",
            showlegend=False
        )
        st.plotly_chart(fig_coverage, use_container_width=True)

    # Questions by Domain and Subdomain
    if not analytics['questions_by_subdomain'].empty and len(analytics['questions_by_subdomain']) > 0:
        st.markdown("""
            <div style='text-align: center; margin: 30px 0;'>
                <h3 style='color: #0f4c81;'>Distribution by Domain and Subdomain</h3>
            </div>
        """, unsafe_allow_html=True)

        fig = px.treemap(
            analytics['questions_by_subdomain'],
            path=['exam_type', 'domain', 'subdomain'],
            values='count',
            color='count',
            color_continuous_scale='Viridis'
        )
        fig.update_layout(margin=dict(t=30, b=30, l=30, r=30))
        fig.update_traces(textinfo="label+value")
        st.plotly_chart(fig, use_container_width=True)

    # Recent Activity
    if 'recent_activity' in analytics and not analytics['recent_activity'].empty:
        st.markdown("""
            <div style='text-align: center; margin: 30px 0;'>
                <h3 style='color: #0f4c81;'>Recent Activity</h3>
            </div>
        """, unsafe_allow_html=True)

        recent_df = analytics['recent_activity']
        st.dataframe(
            recent_df[['exam_type', 'domain', 'subdomain', 'difficulty_level', 'created_at']],
            hide_index=True,
            column_config={
                'created_at': 'Timestamp',
                'exam_type': 'Exam Type',
                'domain': 'Domain',
                'subdomain': 'Subdomain',
                'difficulty_level': 'Difficulty'
            }
        )

    # Add some spacing at the bottom
    st.markdown("<br><br>", unsafe_allow_html=True)

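# Illustrative only (assumed shapes, not part of the app): display_analytics
# expects the distribution entries in `analytics` to behave like pandas
# Series/DataFrames, since it reads .empty, .index and .values, e.g.:
#   analytics = {
#       'total_questions': 120,
#       'questions_by_exam': pd.Series({'SAT': 70, 'IELTS': 30, 'TOEFL': 20}),
#       'questions_by_difficulty': pd.Series({'Easy': 40, 'Medium': 50, 'Hard': 30}),
#   }
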
def get_unique_domains():
    """Return the supported domains for each exam type (hardcoded lookup)."""
    domains = {
        "SAT": ["Mathematics", "Reading and Writing"],
        "IELTS": ["Reading", "Writing", "Speaking", "Listening"],
        "TOEFL": ["Reading", "Listening", "Speaking", "Writing"]
    }
    return domains

def get_subdomains_for_domain(exam_type, domain):
    """Get subdomains for a specific domain."""
    subdomains = {
        "SAT": {
            "Mathematics": [
                "Algebra",
                "Advanced Mathematics",
                "Problem Solving and Data Analysis",
                "Geometry and Trigonometry"
            ],
            "Reading and Writing": [
                "Information and Ideas",
                "Craft and Structure"
            ]
        },
        "IELTS": {
            "Reading": [
                "Information Location",
                "Critical Analysis",
                "Vocabulary and Reference"
            ],
            "Writing": [
                "Task Analysis",
                "Essay Development",
                "Language Control"
            ],
            "Speaking": [
                "Personal Expression",
                "Topic Development",
                "Communication Skills"
            ],
            "Listening": [
                "Academic Understanding",
                "Pragmatic Understanding",
                "Connecting Information"
            ]
        },
        "TOEFL": {
            "Reading": [
                "Comprehension",
                "Analysis",
                "Academic Skills"
            ],
            "Listening": [
                "Academic Understanding",
                "Pragmatic Understanding",
                "Connecting Information"
            ],
            "Speaking": [
                "Independent Tasks",
                "Integrated Tasks",
                "Delivery Skills"
            ],
            "Writing": [
                "Independent Writing",
                "Integrated Writing",
                "Language Control"
            ]
        }
    }
    return subdomains.get(exam_type, {}).get(domain, [])

def get_topics_for_subdomain(exam_type, domain, subdomain):
    """Get topics for a specific subdomain."""
    topics = {
        "SAT": {
            "Reading and Writing": {
                "Information and Ideas": [
                    "Central Ideas and Details",
                    "Command of Textual Evidence",
                    "Command of Quantitative Evidence",
                    "Inferences",
                    "Words in Context"
                ],
                "Craft and Structure": [
                    "Text Structure and Purpose",
                    "Cross-Text Connections",
                    "Rhetorical Synthesis",
                    "Boundaries",
                    "Transitions"
                ]
            },
            "Mathematics": {
                "Algebra": [
                    "Linear equations in one variable",
                    "Linear equations in two variables",
                    "Linear functions",
                    "Systems of two linear equations in two variables",
                    "Linear inequalities in one or two variables"
                ],
                "Advanced Mathematics": [
                    "Equivalent expressions",
                    "Nonlinear equations in one variable and systems of equations in two variables",
                    "Nonlinear functions"
                ],
                "Problem Solving and Data Analysis": [
                    "Ratios, rates, proportional relationships, and units",
                    "Percentages",
                    "One-variable data: distributions and measures of center and spread",
                    "Two-variable data: models and scatterplots",
                    "Probability and conditional probability",
                    "Inference from sample statistics and margin of error",
                    "Evaluating statistical claims: observational studies and experiments"
                ],
                "Geometry and Trigonometry": [
                    "Area and volume",
                    "Lines, angles, and triangles",
                    "Right triangles and trigonometry",
                    "Circles"
                ]
            }
        }
        # Add IELTS and TOEFL topics here if needed
    }
    return topics.get(exam_type, {}).get(domain, {}).get(subdomain, [])

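# Illustrative only — how the three lookup helpers above cascade when the
# filter widgets below narrow from exam type down to topic (values taken from
# the tables defined above):
#   get_unique_domains()["SAT"]                               -> ["Mathematics", "Reading and Writing"]
#   get_subdomains_for_domain("SAT", "Mathematics")           -> ["Algebra", "Advanced Mathematics", ...]
#   get_topics_for_subdomain("SAT", "Mathematics", "Algebra") -> ["Linear equations in one variable", ...]
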
# Streamlit Interface
st.set_page_config(page_title="📄 PDF to Exam Questions Generator with Supabase Upload", layout="wide")
st.title("📄 PDF to Exam Questions Generator with Supabase Upload")

# Create tabs for different functionalities
tab_upload, tab_view, tab_analytics = st.tabs(["📤 Upload & Generate", "🔍 View Questions", "📊 Analytics"])

with tab_upload:
    st.markdown(
        """
        Upload PDF files containing exam material, select the exam type, and generate structured questions automatically.
        The generated questions will be uploaded to your Supabase database.

        **Supported Exam Types**: SAT, IELTS, TOEFL
        """
    )

    # File uploader and exam type selection
    uploaded_files = st.file_uploader("📥 Upload PDFs", type=["pdf"], accept_multiple_files=True)
    exam_type = st.selectbox(
        "📝 Select Exam Type",
        options=["SAT", "IELTS", "TOEFL"],
        index=0
    )

    # Generate and Upload Button
    if st.button("🚀 Generate and Upload Questions"):
        if not uploaded_files:
            st.error("Please upload at least one PDF file.")
        else:
            with st.spinner("Processing files..."):
                questions_json, download_content = process_pdfs(uploaded_files, exam_type)
                if questions_json:
                    st.success(f"Successfully processed {len(uploaded_files)} files and generated questions!")
                    st.json(json.loads(questions_json))

                    # Provide download button
                    st.download_button(
                        label="⬇️ Download Questions JSON",
                        data=download_content,
                        file_name=f"generated_questions_{uuid.uuid4()}.json",
                        mime="application/json"
                    )

with tab_view:
    st.subheader("Question Browser")

    # Initialize session state
    if 'selected_domain' not in st.session_state:
        st.session_state.selected_domain = "All"
    if 'selected_subdomain' not in st.session_state:
        st.session_state.selected_subdomain = "All"
    if 'selected_topic' not in st.session_state:
        st.session_state.selected_topic = "All"

    # Filters
    col1, col2 = st.columns(2)
    with col1:
        view_exam_type = st.selectbox("Exam Type", ["All"] + EXAM_TYPES, key="view_exam_type")

        # Get domains based on exam type
        domains = ["All"]
        if view_exam_type != "All":
            domains.extend(get_unique_domains().get(view_exam_type, []))
        domain = st.selectbox("Domain", domains, key="domain_select")

        # Reset subdomain and topic when the domain changes
        if domain != st.session_state.get('last_domain'):
            st.session_state.selected_subdomain = "All"
            st.session_state.last_domain = domain
            st.session_state.selected_topic = "All"

    with col2:
        difficulty = st.selectbox("Difficulty Level", ["All"] + DIFFICULTY_LEVELS)

        # Get subdomains based on selected exam type and domain
        subdomains = ["All"]
        if domain != "All" and view_exam_type != "All":
            subdomains.extend(get_subdomains_for_domain(view_exam_type, domain))
        subdomain = st.selectbox("Subdomain", subdomains, key="subdomain_select")

        # Get topics based on selected exam type, domain, and subdomain
        topics = ["All"]
        if subdomain != "All" and domain != "All" and view_exam_type != "All":
            topics.extend(get_topics_for_subdomain(view_exam_type, domain, subdomain))
        topic = st.selectbox("Topic", topics, key="topic_select")

    # Apply filters
    filters = {
        'exam_type': view_exam_type if view_exam_type != "All" else None,
        'difficulty_level': difficulty if difficulty != "All" else None,
        'domain': domain if domain != "All" else None,
        'subdomain': subdomain if subdomain != "All" else None,
        'topic': topic if topic != "All" else None
    }

    # Remove None values from filters
    filters = {k: v for k, v in filters.items() if v is not None}

    # Get filtered questions
    questions = get_questions(filters)

    if not questions:
        st.info("No questions found matching the selected filters.")
    else:
        st.success(f"Found {len(questions)} questions")

        # Display questions
        for i, question in enumerate(questions):
            display_question(question, i)

with tab_analytics:
    # Get all questions for analytics
    all_questions = get_questions()
    analytics = get_analytics_data(all_questions)
    display_analytics(analytics)

st.markdown(
    """
    ---
    **Note**: This application uses Azure OpenAI services to generate exam questions and uploads them to Supabase. Ensure that your API credentials are correctly set in the environment variables.
    """
)
fix.py
ADDED
@@ -0,0 +1,349 @@
# fix.py

import os
import json
import logging
import re
from typing import Dict, Any, Optional
from io import BytesIO
import concurrent.futures
from threading import Lock
import queue

import openai
from supabase import create_client, Client
from dotenv import load_dotenv
from tqdm import tqdm  # For progress bar
from openai import AzureOpenAI

# Set up logging with thread safety
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(threadName)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('fix.log'),
        logging.StreamHandler()
    ]
)

# Load environment variables from .env file (if present)
load_dotenv()

# Constants
MIN_PASSAGE_WORDS = 100  # Minimum number of words for reading_passage
VALID_CORRECT_ANSWERS = {'A', 'B', 'C', 'D'}
EXAM_TYPES = ["SAT", "IELTS", "TOEFL"]

# Load environment variables
SUPABASE_URL = os.getenv("SUPABASE_DB_URL")
SUPABASE_API_KEY = os.getenv("SUPABASE_API_KEY")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2023-05-15")

# Validate environment variables
missing_vars = []
if not SUPABASE_URL:
    missing_vars.append("SUPABASE_DB_URL")
if not SUPABASE_API_KEY:
    missing_vars.append("SUPABASE_API_KEY")
if not AZURE_OPENAI_KEY:
    missing_vars.append("AZURE_OPENAI_KEY")
if not AZURE_OPENAI_ENDPOINT:
    missing_vars.append("AZURE_OPENAI_ENDPOINT")
if not AZURE_OPENAI_DEPLOYMENT_NAME:
    missing_vars.append("AZURE_OPENAI_DEPLOYMENT_NAME")

if missing_vars:
    logging.error(f"Missing environment variables: {', '.join(missing_vars)}")
    raise EnvironmentError(f"Missing environment variables: {', '.join(missing_vars)}")

# Initialize Supabase client
supabase: Client = create_client(SUPABASE_URL, SUPABASE_API_KEY)
logging.info("Connected to Supabase successfully.")

# Initialize OpenAI for Azure (module-level settings; the AzureOpenAI client
# below is what this script actually calls)
openai.api_type = "azure"
openai.api_key = AZURE_OPENAI_KEY
openai.api_base = AZURE_OPENAI_ENDPOINT
openai.api_version = AZURE_OPENAI_API_VERSION

# Set up Azure OpenAI client
API_KEY = os.getenv("AZURE_OPENAI_KEY")
ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

if not API_KEY or not ENDPOINT or not DEPLOYMENT_NAME:
    raise ValueError("Azure OpenAI configuration is incomplete.")

client = AzureOpenAI(
    api_key=API_KEY,
    api_version="2024-02-15-preview",
    azure_endpoint=ENDPOINT
)

# Thread-safe counter for progress tracking
class AtomicCounter:
    def __init__(self, initial=0):
        self._value = initial
        self._lock = Lock()

    def increment(self):
        with self._lock:
            self._value += 1
            return self._value

    def value(self):
        with self._lock:
            return self._value

def word_count(text: str) -> int:
    """Returns the number of words in a given text."""
    return len(text.split())

def is_valid_correct_answer(answer: str) -> bool:
    """Checks if the correct_answer is one of A, B, C, D."""
    return answer.upper() in VALID_CORRECT_ANSWERS

def clean_text(text: str) -> str:
    """Cleans the text by collapsing runs of whitespace and trimming the ends."""
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace characters with a single space
    text = text.strip()
    return text

def check_row_quality(row: Dict[str, Any]) -> bool:
    """
    Checks if the row has good quality data according to exam standards.
    Returns True if the row is good, False if it needs fixing.
    """
    # Skip if already fixed
    if row.get('is_fixed'):
        return True

    required_fields = [
        'exam_type', 'content_type', 'exam_section', 'domain', 'subdomain',
        'topic', 'difficulty_level', 'reading_passage', 'question_text',
        'option_a', 'option_b', 'option_c', 'option_d', 'correct_answer',
        'explanation'
    ]

    # Check for missing or empty required fields
    for field in required_fields:
        if not row.get(field):
            return False

    # Check for OCR artifacts in text fields
    text_fields = ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']
    for field in text_fields:
        text = row.get(field, '')
        if isinstance(text, str):
            if 'arebasedonthe' in text or text.count('.') > 20 or 'Line' in text:
                return False

    return True

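# Illustrative only — a row whose text carries run-together OCR tokens fails
# the check even when every required field is present (`complete_row` is a
# hypothetical dict holding all required fields):
#   check_row_quality({**complete_row, 'question_text': 'Questions1-5arebasedonthe passage'})  # -> False
#   check_row_quality({**complete_row, 'is_fixed': True})                                      # -> True (short-circuits)
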
def generate_fixed_content(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Uses Azure OpenAI to generate fixed content for a row.
    Returns a dictionary with fixed fields or None if failed.
    """
    prompt = f"""Fix and improve the following exam question. Clean up any OCR artifacts, fix formatting issues, and ensure high quality.

Current Question:
Reading Passage: {row.get('reading_passage', '')}
Question: {row.get('question_text', '')}
Options:
A) {row.get('option_a', '')}
B) {row.get('option_b', '')}
C) {row.get('option_c', '')}
D) {row.get('option_d', '')}
Correct Answer: {row.get('correct_answer', '')}
Explanation: {row.get('explanation', '')}

Requirements:
1. Clean up any OCR artifacts and formatting issues
2. Maintain the same meaning and difficulty level
3. Keep the same correct answer
4. Ensure the explanation clearly justifies the answer
5. Make sure all text is properly formatted and readable
6. Preserve all important content and details
7. Fix any spacing or punctuation issues

Return a JSON object with the following fields:
{{
    "reading_passage": "cleaned passage",
    "question_text": "cleaned question",
    "option_a": "cleaned option A",
    "option_b": "cleaned option B",
    "option_c": "cleaned option C",
    "option_d": "cleaned option D",
    "explanation": "cleaned explanation"
}}"""

    try:
        response = client.chat.completions.create(
            model=DEPLOYMENT_NAME,
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert at fixing and improving exam questions. Clean up formatting while preserving meaning."
                },
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"},
            temperature=0.0
        )

        fixed_content = json.loads(response.choices[0].message.content)

        # Preserve original fields and update only the fixed ones
        updated_data = row.copy()
        updated_data.update(fixed_content)
        updated_data['is_fixed'] = True

        return updated_data

    except Exception as e:
        logging.error(f"Error generating fixed content: {str(e)}")
        return None

def extract_json(text: str) -> Optional[str]:
    """
    Extracts a JSON object from a block of text.
    Returns the JSON string or None if not found.
    """
    try:
        # Find the first { and the last }
        start = text.find('{')
        end = text.rfind('}')
        if start == -1 or end == -1:
            return None
        json_str = text[start:end + 1]
        # Validate JSON
        json.loads(json_str)
        return json_str
    except json.JSONDecodeError:
        return None

def update_row_in_supabase(row_id: str, fixed_data: Dict[str, Any]) -> bool:
    """
    Updates a row in Supabase with fixed data.
    Returns True if successful, False otherwise.
    """
    try:
        response = supabase.table("exam_contents").update(fixed_data).eq("id", row_id).execute()

        # Check if data exists in the response
        if response.data:
            logging.info(f"Successfully updated row ID {row_id}.")
            return True
        else:
            logging.error(f"Failed to update row ID {row_id}.")
            return False

    except Exception as e:
        logging.error(f"Exception while updating row ID {row_id}: {str(e)}")
        return False

def process_row(row: Dict[str, Any], progress_counter: AtomicCounter, total_rows: int) -> Dict[str, Any]:
    """
    Process a single row with progress tracking.
    Returns a dictionary with the results.
    """
    row_id = row.get('id')
    result = {
        'row_id': row_id,
        'success': False,
        'message': ''
    }

    try:
        if not row_id:
            result['message'] = "Row without ID found"
            return result

        if check_row_quality(row):
            success = update_row_in_supabase(row_id, {'is_fixed': True})
            result['success'] = success
            result['message'] = "Good quality, marked as fixed"
            progress_counter.increment()
            return result

        fixed_data = generate_fixed_content(row)
        if not fixed_data:
            result['message'] = "Failed to fix content"
            progress_counter.increment()
            return result

        success = update_row_in_supabase(row_id, fixed_data)
        result['success'] = success
        result['message'] = "Successfully fixed and updated" if success else "Failed to update"

    except Exception as e:
        result['message'] = f"Error: {str(e)}"
        logging.error(f"Error processing row {row_id}: {str(e)}")

    progress_counter.increment()
    progress = progress_counter.value()
    if progress % 10 == 0:  # Update progress every 10 rows
        print(f"Progress: {progress}/{total_rows} rows processed")

    return result

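# Illustrative only — each process_row call returns a small result record that
# main() tallies below, e.g.:
#   {'row_id': '<id>', 'success': True, 'message': 'Successfully fixed and updated'}
#   {'row_id': '<id>', 'success': False, 'message': 'Failed to fix content'}
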
def main():
    """
    Main function to process and fix exam questions in Supabase using multithreading.
    """
    logging.info("Starting fix.py script with multithreading.")

    try:
        # Fetch only unfixed rows from exam_contents
        response = supabase.table("exam_contents").select("*").eq("is_fixed", False).execute()
        rows = response.data
        total_rows = len(rows)
        logging.info(f"Fetched {total_rows} unfixed rows from exam_contents.")

        if total_rows == 0:
            logging.info("No unfixed rows found in exam_contents. Exiting.")
            print("No unfixed rows found in exam_contents. Exiting.")
            return

        # Initialize counters
        progress_counter = AtomicCounter()
        success_count = 0
        failure_count = 0

        # Create a thread pool
        max_workers = min(32, total_rows)  # Cap at 32 threads or total rows, whichever is smaller
        print(f"Starting processing with {max_workers} threads...")

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all rows for processing
            future_to_row = {
                executor.submit(process_row, row, progress_counter, total_rows): row
                for row in rows
            }

            # Process completed futures as they finish
            for future in concurrent.futures.as_completed(future_to_row):
                result = future.result()
                if result['success']:
                    success_count += 1
                else:
                    failure_count += 1
                    logging.warning(f"Failed to process row {result['row_id']}: {result['message']}")

        # Final statistics
        logging.info(f"Processing completed. Success: {success_count}, Failures: {failure_count}")
        print("\nProcessing completed:")
        print(f"Total rows processed: {total_rows}")
        print(f"Successful updates: {success_count}")
        print(f"Failed updates: {failure_count}")

    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")
        print(f"An unexpected error occurred: {str(e)}")

if __name__ == "__main__":
    main()
requirements.txt
ADDED
@@ -0,0 +1,9 @@
streamlit
openai
python-dotenv
pydantic
supabase
PyMuPDF
plotly
pandas
tiktoken