Update fix.py
fix.py
CHANGED
Removed from the old version (deleted lines carry a leading "-", unchanged context lines carry none; text cut off in the diff view is marked "..."):

@@ -1,30 +1,71 @@
 import os
 import json
 import logging
-import re
-from typing import Dict, Any, Optional
-from io import BytesIO
 import concurrent.futures
 import queue

 from supabase import create_client, Client
 from dotenv import load_dotenv

@@ -33,6 +74,7 @@ load_dotenv()
(additions only)

@@ -56,38 +98,25 @@ if not AZURE_OPENAI_DEPLOYMENT_NAME:
 # Initialize Supabase client
 supabase: Client = create_client(SUPABASE_URL, SUPABASE_API_KEY)
-logging.info("Connected to Supabase successfully.")
-
-# Initialize OpenAI for Azure
-openai.api_type = "azure"
-openai.api_key = AZURE_OPENAI_KEY
-openai.api_base = AZURE_OPENAI_ENDPOINT
-openai.api_version = AZURE_OPENAI_API_VERSION
-
-# Set up Azure OpenAI client
-API_KEY = os.getenv("AZURE_OPENAI_KEY")
-ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
-DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
-
-if not API_KEY or not ENDPOINT or not DEPLOYMENT_NAME:
-    raise ValueError("Azure OpenAI configuration is incomplete.")

 client = AzureOpenAI(
-    api_key=...
-    api_version=...
-    azure_endpoint=...
 )

 class AtomicCounter:
     def __init__(self, initial=0):
         self._value = initial
-        self._lock = Lock()

@@ -98,6 +127,133 @@ class AtomicCounter:
(additions only)

@@ -120,111 +276,70 @@ def check_row_quality(row: Dict[str, Any]) -> bool:
(inside check_row_quality the old required-field and text checks are rewritten; the old generate_fixed_content, which requested a JSON response, is removed together with its extract_json helper:)
-    Returns a dictionary with fixed fields or None if failed.
-    """
-    prompt = f"""Fix and improve the following exam question. Clean up any OCR artifacts, fix formatting issues, and ensure high quality.
-...
-A) {row.get('option_a', '')}
-B) {row.get('option_b', '')}
-C) {row.get('option_c', '')}
-D) {row.get('option_d', '')}
-Correct Answer: {row.get('correct_answer', '')}
-Explanation: {row.get('explanation', '')}
-
-Requirements:
-1. Clean up any OCR artifacts and formatting issues
-2. Maintain the same meaning and difficulty level
-3. Keep the same correct answer
-4. Ensure the explanation clearly justifies the answer
-5. Make sure all text is properly formatted and readable
-6. Preserve all important content and details
-7. Fix any spacing or punctuation issues
-
-Return a JSON object with the following fields:
-{{
-    "reading_passage": "cleaned passage",
-    "question_text": "cleaned question",
-    "option_a": "cleaned option A",
-    "option_b": "cleaned option B",
-    "option_c": "cleaned option C",
-    "option_d": "cleaned option D",
-    "explanation": "cleaned explanation"
-}}"""
-...
-            "content": "You are an expert at fixing and improving exam questions. Clean up formatting while preserving meaning."
-        },
-        {"role": "user", "content": prompt}
-    ],
-    response_format={"type": "json_object"},
-    temperature=0.0
-)
-
-fixed_content = json.loads(response.choices[0].message.content)
-
-# Preserve original fields and update only the fixed ones
-updated_data = row.copy()
-updated_data.update(fixed_content)
-updated_data['is_fixed'] = True
-
-return updated_data
-
-except Exception as e:
-    logging.error(f"Error generating fixed content: {str(e)}")
-    return None
-
-    """
-    Extracts JSON object from a block of text.
-    Returns the JSON string or None if not found.
-    """
-    try:
-        # Find the first { and the last }
-        start = text.find('{')
-        end = text.rfind('}')
-        if start == -1 or end == -1:
-            return None
-        json_str = text[start:end+1]
-        # Validate JSON
-        json.loads(json_str)
-        return json_str
-    except json.JSONDecodeError:
-        return None

@@ -233,117 +348,195 @@ def update_row_in_supabase(row_id: str, fixed_data: Dict[str, Any]) -> bool:
(the old process_row reported progress with print() every ten rows, and the old main() fetched every unfixed row in a single query:)
-def process_row(row: Dict[str, Any], progress_counter: AtomicCounter, total_rows: int) -> Dict[str, Any]:
-    """
-    Process a single row with progress tracking.
-    Returns a dictionary with the results.
-    """
-    row_id = row.get('id')
-    result['message'] = "Successfully fixed and updated" if success else "Failed to update"
-    progress_counter.increment()
-    progress = progress_counter.value()
-    if progress % 10 == 0:  # Update progress every 10 rows
-        print(f"Progress: {progress}/{total_rows} rows processed")
-    logging.info("Starting fix.py script with multithreading.")
-    rows = response.data
-    total_rows = len(rows)
-    logging.info(f"Fetched {total_rows} unfixed rows from exam_contents.")
-    max_workers = min(32, ...
The new fix.py (regions the diff leaves unchanged between hunks are marked "..."):

# fix.py

import os
import re
import json
import logging
import concurrent.futures
from typing import Dict, Any, Optional, List
import queue
import time
from datetime import datetime
import threading
import functools

from openai import AzureOpenAI
from supabase import create_client, Client
from tqdm import tqdm
from dotenv import load_dotenv
from ratelimiter import RateLimiter  # NB: immediately shadowed by the RateLimiter class defined below

# Set up logging with thread safety and custom formatting
class CustomFormatter(logging.Formatter):
    """Custom formatter with colors and better formatting"""
    grey = "\x1b[38;21m"
    blue = "\x1b[38;5;39m"
    yellow = "\x1b[38;5;226m"
    red = "\x1b[38;5;196m"
    bold_red = "\x1b[31;1m"
    reset = "\x1b[0m"

    def __init__(self, fmt):
        super().__init__()
        self.fmt = fmt
        self.FORMATS = {
            logging.DEBUG: self.grey + self.fmt + self.reset,
            logging.INFO: self.blue + self.fmt + self.reset,
            logging.WARNING: self.yellow + self.fmt + self.reset,
            logging.ERROR: self.red + self.fmt + self.reset,
            logging.CRITICAL: self.bold_red + self.fmt + self.reset
        }

    def format(self, record):
        log_fmt = self.FORMATS.get(record.levelno)
        formatter = logging.Formatter(log_fmt)
        return formatter.format(record)

# Set up logging configuration
logger = logging.getLogger('fix')
logger.setLevel(logging.INFO)

# File handler with simple formatting
file_handler = logging.FileHandler('fix.log')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)

# Console handler with color formatting
console_handler = logging.StreamHandler()
console_handler.setFormatter(CustomFormatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(console_handler)

# Create a summary log file for each run
current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
summary_file = f'fix_summary_{current_time}.log'
summary_handler = logging.FileHandler(summary_file)
summary_handler.setFormatter(logging.Formatter('%(message)s'))
summary_logger = logging.getLogger('summary')
summary_logger.addHandler(summary_handler)
summary_logger.setLevel(logging.INFO)

# Load environment variables from .env file (if present)
load_dotenv()

# ... (unchanged lines not shown in the diff) ...

MIN_PASSAGE_WORDS = 100  # Minimum number of words for reading_passage
VALID_CORRECT_ANSWERS = {'A', 'B', 'C', 'D'}
EXAM_TYPES = ["SAT", "IELTS", "TOEFL"]
DIFFICULTY_LEVELS = ["Easy", "Medium", "Hard"]

# Load environment variables
SUPABASE_URL = os.getenv("SUPABASE_DB_URL")

# ... (unchanged lines not shown in the diff) ...

    missing_vars.append("AZURE_OPENAI_DEPLOYMENT_NAME")

if missing_vars:
    error_msg = f"Missing required environment variables: {', '.join(missing_vars)}"
    logger.error(error_msg)
    raise ValueError(error_msg)

# Initialize Supabase client
supabase: Client = create_client(SUPABASE_URL, SUPABASE_API_KEY)

# Initialize Azure OpenAI client
client = AzureOpenAI(
    api_key=AZURE_OPENAI_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT
)

# Thread-safe counter for progress tracking
class AtomicCounter:
    def __init__(self, initial=0):
        self._value = initial
        self._lock = threading.Lock()

    def increment(self):
        with self._lock:
            self._value += 1

    def value(self):
        with self._lock:
            return self._value

class RateLimiter:
    """Rate limiter implementation using token bucket algorithm"""
    def __init__(self, max_calls: int, period: float):
        self.max_calls = max_calls
        self.period = period
        self.calls = []
        self.lock = threading.Lock()

    def __call__(self, func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            with self.lock:
                now = time.time()
                # Remove old calls outside the window
                self.calls = [call for call in self.calls if call > now - self.period]

                if len(self.calls) >= self.max_calls:
                    sleep_time = self.calls[0] - (now - self.period)
                    if sleep_time > 0:
                        time.sleep(sleep_time)
                    # Recalculate after sleep
                    now = time.time()
                    self.calls = [call for call in self.calls if call > now - self.period]

                self.calls.append(now)

            return func(*args, **kwargs)
        return wrapped

# Initialize Rate Limiter: 60 calls per minute
rate_limiter = RateLimiter(max_calls=60, period=60)

@rate_limiter
def generate_fixed_content(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Uses Azure OpenAI to generate fixed content for a row.
    Returns a dictionary with fixed content or None if generation fails.
    """
    try:
        # Create system message with formatting requirements
        system_message = """You are an expert at fixing exam questions. Follow these rules:
1. Maintain academic language and tone
2. Keep all factual information unchanged
3. Fix grammar and clarity issues
4. Ensure options are clear and distinct
5. Format text consistently"""

        # Create user message with the content to fix
        user_message = f"""Please fix the following exam question content:
Reading Passage: {row.get('reading_passage', '')}
Question: {row.get('question_text', '')}
Options:
A) {row.get('option_a', '')}
B) {row.get('option_b', '')}
C) {row.get('option_c', '')}
D) {row.get('option_d', '')}
Explanation: {row.get('explanation', '')}"""

        # Call Azure OpenAI API
        response = client.chat.completions.create(
            model=AZURE_OPENAI_DEPLOYMENT_NAME,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ],
            temperature=0.3,
            max_tokens=2000,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0
        )

        # Extract the response content
        if not response.choices:
            logger.error("No response generated from OpenAI")
            return None

        content = response.choices[0].message.content

        # Parse the response using regex
        fixed_data = {}

        # Extract reading passage
        reading_match = re.search(r"Reading Passage:\s*(.*?)(?=Question:|$)", content, re.DOTALL)
        if reading_match:
            fixed_data['reading_passage'] = reading_match.group(1).strip()

        # Extract question
        question_match = re.search(r"Question:\s*(.*?)(?=Options:|$)", content, re.DOTALL)
        if question_match:
            fixed_data['question_text'] = question_match.group(1).strip()

        # Extract options
        options_pattern = {
            'option_a': r"A\)\s*(.*?)(?=B\)|$)",
            'option_b': r"B\)\s*(.*?)(?=C\)|$)",
            'option_c': r"C\)\s*(.*?)(?=D\)|$)",
            'option_d': r"D\)\s*(.*?)(?=Explanation:|$)"
        }

        for key, pattern in options_pattern.items():
            match = re.search(pattern, content, re.DOTALL)
            if match:
                fixed_data[key] = match.group(1).strip()

        # Extract explanation
        explanation_match = re.search(r"Explanation:\s*(.*?)$", content, re.DOTALL)
        if explanation_match:
            fixed_data['explanation'] = explanation_match.group(1).strip()

        # Validate that all required fields are present
        required_fields = ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']
        if not all(field in fixed_data for field in required_fields):
            logger.error("Missing required fields in generated content")
            return None

        # Copy over unchanged fields
        for key in row:
            if key not in fixed_data and key != 'id':
                fixed_data[key] = row[key]

        return fixed_data

    except Exception as e:
        logger.error(f"Error generating fixed content: {str(e)}")
        return None

def word_count(text: str) -> int:
    """Returns the number of words in a given text."""
    return len(text.split())

# ... (unchanged lines not shown in the diff) ...

def check_row_quality(row: Dict[str, Any]) -> bool:
    # Skip if already fixed
    if row.get('is_fixed'):
        return True

    # Required fields must be present and non-empty
    required_fields = [
        'exam_type', 'content_type', 'exam_section', 'domain', 'subdomain',
        'topic', 'difficulty_level', 'reading_passage', 'question_text',
        'option_a', 'option_b', 'option_c', 'option_d', 'correct_answer',
        'explanation'
    ]

    # Check for missing or empty required fields
    for field in required_fields:
        value = row.get(field, '').strip() if isinstance(row.get(field), str) else row.get(field)
        if not value:
            return False

    # Check for valid exam type
    if row['exam_type'] not in EXAM_TYPES:
        return False

    # Check for valid difficulty level
    if row['difficulty_level'] not in DIFFICULTY_LEVELS:
        return False

    # Check for valid correct answer format
    if not is_valid_correct_answer(row['correct_answer']):
        return False

    # Check for common OCR and formatting issues
    text_fields = ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']
    for field in text_fields:
        text = row.get(field, '')
        if isinstance(text, str):
            # Check for OCR artifacts
            if any(artifact in text.lower() for artifact in [
                'arebasedonthe', 'lineno', 'click here', 'seenext', 'seebelow',
                'answerthefollowing', 'choosethebest', 'selectthe'
            ]):
                return False

            # Check for formatting issues
            if text.count('.') > 20:  # Too many periods might indicate formatting issues
                return False
            if text.count('\n') > 20:  # Too many newlines might indicate formatting issues
                return False
            if len(text.split()) < 2:  # Text should have at least 2 words
                return False

    # Check minimum length requirements
    if len(row['reading_passage'].split()) < MIN_PASSAGE_WORDS:
        return False

    # Check for duplicate options
    options = [row['option_a'], row['option_b'], row['option_c'], row['option_d']]
    if len(set(options)) != len(options):
        return False

    # Check for explanation quality
    explanation = row['explanation']
    if len(explanation.split()) < 10:  # Explanation should be reasonably detailed
        return False
    if not any(word in explanation.lower() for word in ['because', 'since', 'as', 'therefore', 'thus', 'hence']):
        return False

    return True

def update_row_in_supabase(row_id: str, fixed_data: Dict[str, Any]) -> bool:
    """
    Updates the given row in the exam_contents table.
    Returns True on success, False otherwise.
    """
    try:
        response = supabase.table("exam_contents").update(fixed_data).eq("id", row_id).execute()

        # Check if data exists in the response
        if response.data:
            logger.debug(f"HTTP Request: PATCH https://{SUPABASE_URL}/rest/v1/exam_contents?id=eq.{row_id} \"HTTP/2 200 OK\"")
            logger.info(f"Row {row_id}: Successfully updated.")
            return True
        else:
            logger.error(f"Row {row_id}: Failed to update.")
            return False

    except Exception as e:
        logger.error(f"Row {row_id}: Exception while updating - {str(e)}")
        return False

def process_row(row: Dict[str, Any], progress_counter: AtomicCounter, total_rows: int, row_number: int) -> Dict[str, Any]:
    """Process a single row with progress tracking."""
    result = {
        'row_id': row.get('id'),
        'success': False,
        'message': '',
        'changes_made': []
    }

    try:
        row_id = row.get('id')
        if not row_id:
            result['message'] = "Row without ID found"
            logger.warning(f"Row {row_number}: {result['message']}")
            return result

        # Check initial quality
        initial_quality_issues = []
        if not row.get('reading_passage'):
            initial_quality_issues.append("Missing reading passage")
        if not row.get('question_text'):
            initial_quality_issues.append("Missing question text")
        if not all(row.get(f'option_{opt}') for opt in ['a', 'b', 'c', 'd']):
            initial_quality_issues.append("Missing options")
        if not row.get('correct_answer'):
            initial_quality_issues.append("Missing correct answer")

        if initial_quality_issues:
            logger.info(f"Row {row_number}: Quality issues found - {', '.join(initial_quality_issues)}")

        if check_row_quality(row):
            success = update_row_in_supabase(row_id, {'is_fixed': True})
            result['success'] = success
            result['message'] = "Already good quality, marked as fixed"
            if success:
                logger.info(f"Row {row_number}: Already good quality. Marked as fixed.")
            else:
                logger.error(f"Row {row_number}: Failed to mark as fixed.")
            progress_counter.increment()
            return result

        # Generate fixed content
        fixed_data = generate_fixed_content(row)
        if not fixed_data:
            result['message'] = "Failed to fix content"
            logger.error(f"Row {row_number}: Failed to generate fixed content.")
            progress_counter.increment()
            return result

        # Compare changes
        for field in ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']:
            if fixed_data.get(field) != row.get(field):
                result['changes_made'].append(field)

        fixed_data['is_fixed'] = True
        success = update_row_in_supabase(row_id, fixed_data)
        result['success'] = success

        if success:
            changes = ', '.join(result['changes_made']) if result['changes_made'] else 'No changes needed'
            result['message'] = f"Fixed successfully. Changes in: {changes}"
            logger.info(f"Row {row_number}: Fixed successfully. Modified: {changes}")
        else:
            result['message'] = "Failed to update after fixing"
            logger.error(f"Row {row_number}: Failed to update after fixing.")

    except Exception as e:
        result['message'] = f"Error: {str(e)}"
        logger.error(f"Row {row_number}: Error processing - {str(e)}")

    progress_counter.increment()
    return result

def fetch_all_unfixed_rows(supabase_client: Client, batch_size: int = 1000):
    """
    Fetches all unfixed rows from the exam_contents table in batches.

    Args:
        supabase_client (Client): The Supabase client instance.
        batch_size (int): Number of rows to fetch per batch.

    Yields:
        List[Dict[str, Any]]: A batch of rows.
    """
    # Initialize the starting range
    start = 0
    while True:
        # Fetch a batch of rows
        response = supabase_client.table("exam_contents")\
            .select("*")\
            .eq("is_fixed", False)\
            .range(start, start + batch_size - 1)\
            .execute()

        batch = response.data
        if not batch:
            break  # No more rows to fetch

        yield batch
        start += batch_size

def main():
    """Main function to process and fix exam questions in Supabase using multithreading."""
    start_time = time.time()
    logger.info("Starting fix.py script")
    summary_logger.info("\n=== Question Fix Summary ===\n")

    try:
        # Initialize counters
        total_rows = 0
        success_count = 0
        failure_count = 0
        changes_by_field = {
            'reading_passage': 0,
            'question_text': 0,
            'option_a': 0,
            'option_b': 0,
            'option_c': 0,
            'option_d': 0,
            'explanation': 0
        }

        # Create a thread pool
        max_workers = min(32, os.cpu_count() * 2)  # Adjust based on CPU cores
        logger.info(f"Initializing with {max_workers} threads")

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Initialize progress tracking
            progress_counter = AtomicCounter()
            futures = []

            # Process rows in batches
            for batch in fetch_all_unfixed_rows(supabase):
                total_rows += len(batch)
                for i, row in enumerate(batch):
                    future = executor.submit(process_row, row, progress_counter, total_rows, i + 1)
                    futures.append(future)

            # Track progress with tqdm
            with tqdm(total=total_rows, desc="Processing Rows", unit="row", dynamic_ncols=True) as pbar:
                for future in concurrent.futures.as_completed(futures):
                    result = future.result()
                    if result['success']:
                        success_count += 1
                        # Update changes counter
                        for field in result['changes_made']:
                            changes_by_field[field] = changes_by_field.get(field, 0) + 1
                    else:
                        failure_count += 1
                    pbar.update(1)

        # Calculate execution time
        execution_time = time.time() - start_time

        # Log final statistics
        summary = [
            "\n=== Final Statistics ===",
            f"Total questions processed: {total_rows}",
            f"Successful updates: {success_count}",
            f"Failed updates: {failure_count}",
            f"Execution time: {execution_time:.2f} seconds",
            "\nChanges by field:",
            *[f"- {field}: {count}" for field, count in changes_by_field.items() if count > 0],
            "\n=== End of Summary ===\n"
        ]

        # Log to both console and summary file
        for line in summary:
            logger.info(line)
            summary_logger.info(line)

    except Exception as e:
        error_msg = f"An unexpected error occurred: {str(e)}"
        logger.error(error_msg)
        summary_logger.error(error_msg)

if __name__ == "__main__":
    main()
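A note on configuration: every credential above is read with os.getenv() after load_dotenv(), so for local runs the values are typically supplied by a .env file next to the script. A minimal sketch with placeholder values; the variable names are taken from the getenv() calls shown in the diff, except SUPABASE_API_KEY, whose exact name is an assumption since it is read in a section the diff does not show:

    SUPABASE_DB_URL=<your-supabase-project-url>
    SUPABASE_API_KEY=<your-supabase-api-key>
    AZURE_OPENAI_KEY=<your-azure-openai-key>
    AZURE_OPENAI_ENDPOINT=<your-azure-openai-endpoint>
    AZURE_OPENAI_API_VERSION=<your-api-version>
    AZURE_OPENAI_DEPLOYMENT_NAME=<your-deployment-name>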
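The RateLimiter decorator keeps the timestamps of recent calls and, once max_calls of them fall within the last period seconds, sleeps until the oldest one ages out. Because the sleep happens while the lock is held, all worker threads are throttled together, which suits a shared API quota. A self-contained sketch of the same sliding-window idea with toy numbers (2 calls per second rather than the production 60 per minute):

import functools
import threading
import time

class RateLimiter:
    """Allow at most max_calls wrapped calls per rolling period seconds."""
    def __init__(self, max_calls: int, period: float):
        self.max_calls = max_calls
        self.period = period
        self.calls = []                  # timestamps of recent calls
        self.lock = threading.Lock()

    def __call__(self, func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            with self.lock:
                now = time.time()
                # Drop timestamps that have left the window.
                self.calls = [t for t in self.calls if t > now - self.period]
                if len(self.calls) >= self.max_calls:
                    # Sleep until the oldest tracked call ages out.
                    time.sleep(max(0.0, self.calls[0] - (now - self.period)))
                    now = time.time()
                    self.calls = [t for t in self.calls if t > now - self.period]
                self.calls.append(now)
            return func(*args, **kwargs)
        return wrapped

@RateLimiter(max_calls=2, period=1.0)
def ping(i: int) -> None:
    print(f"call {i} at {time.monotonic():.2f}s")

for i in range(5):
    ping(i)   # roughly every third call pauses while the window clears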
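Note that generate_fixed_content() no longer requests a JSON response (the old version did); it parses the model's free-form reply with regular expressions keyed to the same labels used in the user message. The parsing therefore assumes a reply shaped like this hypothetical example; any missing label makes the required-fields check fail, and the function returns None:

    Reading Passage: <cleaned passage>
    Question: <cleaned question>
    Options:
    A) <cleaned option>
    B) <cleaned option>
    C) <cleaned option>
    D) <cleaned option>
    Explanation: <cleaned explanation>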
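For reference, a row that clears every gate in check_row_quality() looks roughly like the sketch below. All values are hypothetical, and is_valid_correct_answer() is presumably defined in a part of the file the diff does not show:

passing_row = {
    'exam_type': 'SAT',                # must be one of EXAM_TYPES
    'difficulty_level': 'Medium',      # must be one of DIFFICULTY_LEVELS
    'correct_answer': 'A',             # must satisfy is_valid_correct_answer()
    'reading_passage': '<at least MIN_PASSAGE_WORDS words>',
    'question_text': 'Which choice best states the main idea of the passage?',
    'option_a': 'First option',        # the four options must be distinct
    'option_b': 'Second option',
    'option_c': 'Third option',
    'option_d': 'Fourth option',
    # at least 10 words and a causal connective such as "because":
    'explanation': 'Option A is correct because the passage returns to this '
                   'central idea in every paragraph.',
    # the remaining required fields (content_type, exam_section, domain,
    # subdomain, topic) must simply be non-empty
}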
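One behavioral caveat in fetch_all_unfixed_rows(): it pages with .range() offsets over the is_fixed = False filter while worker threads are concurrently flipping rows to is_fixed = True, so the filtered result set can shrink between page requests and offset-based pages can skip rows. Rows missed this way remain unfixed and should be picked up by a subsequent run.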
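With the environment variables in place, the script is run directly as python fix.py. It submits every unfixed row to the thread pool, throttles OpenAI calls to 60 per minute, tracks completion with a tqdm progress bar, and writes the closing statistics both to the console and to the per-run fix_summary_<timestamp>.log file.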