poemsforaphrodite committed on
Commit bc93e69 · verified · 1 Parent(s): 0efda62

Update fix.py

Files changed (1)
  1. fix.py +380 -187
fix.py CHANGED
@@ -1,30 +1,71 @@
  # fix.py

  import os
  import json
  import logging
- import re
- from typing import Dict, Any, Optional
- from io import BytesIO
  import concurrent.futures
- from threading import Lock
  import queue

- import openai
  from supabase import create_client, Client
  from dotenv import load_dotenv
- from tqdm import tqdm  # For progress bar
- from openai import AzureOpenAI
-
- # Set up logging with thread safety
- logging.basicConfig(
-     level=logging.INFO,
-     format='%(asctime)s - %(threadName)s - %(levelname)s - %(message)s',
-     handlers=[
-         logging.FileHandler('fix.log'),
-         logging.StreamHandler()
-     ]
- )

  # Load environment variables from .env file (if present)
  load_dotenv()
@@ -33,6 +74,7 @@ load_dotenv()
  MIN_PASSAGE_WORDS = 100  # Minimum number of words for reading_passage
  VALID_CORRECT_ANSWERS = {'A', 'B', 'C', 'D'}
  EXAM_TYPES = ["SAT", "IELTS", "TOEFL"]

  # Load environment variables
  SUPABASE_URL = os.getenv("SUPABASE_DB_URL")
@@ -56,38 +98,25 @@ if not AZURE_OPENAI_DEPLOYMENT_NAME:
      missing_vars.append("AZURE_OPENAI_DEPLOYMENT_NAME")

  if missing_vars:
-     logging.error(f"Missing environment variables: {', '.join(missing_vars)}")
-     raise EnvironmentError(f"Missing environment variables: {', '.join(missing_vars)}")

  # Initialize Supabase client
  supabase: Client = create_client(SUPABASE_URL, SUPABASE_API_KEY)
- logging.info("Connected to Supabase successfully.")
-
- # Initialize OpenAI for Azure
- openai.api_type = "azure"
- openai.api_key = AZURE_OPENAI_KEY
- openai.api_base = AZURE_OPENAI_ENDPOINT
- openai.api_version = AZURE_OPENAI_API_VERSION
-
- # Set up Azure OpenAI client
- API_KEY = os.getenv("AZURE_OPENAI_KEY")
- ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
- DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
-
- if not API_KEY or not ENDPOINT or not DEPLOYMENT_NAME:
-     raise ValueError("Azure OpenAI configuration is incomplete.")

  client = AzureOpenAI(
-     api_key=API_KEY,
-     api_version="2024-02-15-preview",
-     azure_endpoint=ENDPOINT
  )

  # Thread-safe counter for progress tracking
  class AtomicCounter:
      def __init__(self, initial=0):
          self._value = initial
-         self._lock = Lock()

      def increment(self):
          with self._lock:
@@ -98,6 +127,133 @@ class AtomicCounter:
          with self._lock:
              return self._value

  def word_count(text: str) -> int:
      """Returns the number of words in a given text."""
      return len(text.split())
@@ -120,111 +276,70 @@ def check_row_quality(row: Dict[str, Any]) -> bool:
      # Skip if already fixed
      if row.get('is_fixed'):
          return True
-
      required_fields = [
          'exam_type', 'content_type', 'exam_section', 'domain', 'subdomain',
          'topic', 'difficulty_level', 'reading_passage', 'question_text',
          'option_a', 'option_b', 'option_c', 'option_d', 'correct_answer',
          'explanation'
      ]
-
      # Check for missing or empty required fields
      for field in required_fields:
-         if not row.get(field):
              return False
-
-     # Check for OCR artifacts in text fields
      text_fields = ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']
      for field in text_fields:
          text = row.get(field, '')
          if isinstance(text, str):
-             if 'arebasedonthe' in text or text.count('.') > 20 or 'Line' in text:
                  return False
-
-     return True

- def generate_fixed_content(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
-     """
-     Uses Azure OpenAI to generate fixed content for a row.
-     Returns a dictionary with fixed fields or None if failed.
-     """
-     prompt = f"""Fix and improve the following exam question. Clean up any OCR artifacts, fix formatting issues, and ensure high quality.
-
- Current Question:
- Reading Passage: {row.get('reading_passage', '')}
- Question: {row.get('question_text', '')}
- Options:
- A) {row.get('option_a', '')}
- B) {row.get('option_b', '')}
- C) {row.get('option_c', '')}
- D) {row.get('option_d', '')}
- Correct Answer: {row.get('correct_answer', '')}
- Explanation: {row.get('explanation', '')}
-
- Requirements:
- 1. Clean up any OCR artifacts and formatting issues
- 2. Maintain the same meaning and difficulty level
- 3. Keep the same correct answer
- 4. Ensure the explanation clearly justifies the answer
- 5. Make sure all text is properly formatted and readable
- 6. Preserve all important content and details
- 7. Fix any spacing or punctuation issues
-
- Return a JSON object with the following fields:
- {{
- "reading_passage": "cleaned passage",
- "question_text": "cleaned question",
- "option_a": "cleaned option A",
- "option_b": "cleaned option B",
- "option_c": "cleaned option C",
- "option_d": "cleaned option D",
- "explanation": "cleaned explanation"
- }}"""
-
-     try:
-         response = client.chat.completions.create(
-             model=DEPLOYMENT_NAME,
-             messages=[
-                 {
-                     "role": "system",
-                     "content": "You are an expert at fixing and improving exam questions. Clean up formatting while preserving meaning."
-                 },
-                 {"role": "user", "content": prompt}
-             ],
-             response_format={"type": "json_object"},
-             temperature=0.0
-         )
-
-         fixed_content = json.loads(response.choices[0].message.content)
-
-         # Preserve original fields and update only the fixed ones
-         updated_data = row.copy()
-         updated_data.update(fixed_content)
-         updated_data['is_fixed'] = True
-
-         return updated_data
-
-     except Exception as e:
-         logging.error(f"Error generating fixed content: {str(e)}")
-         return None

- def extract_json(text: str) -> Optional[str]:
-     """
-     Extracts JSON object from a block of text.
-     Returns the JSON string or None if not found.
-     """
-     try:
-         # Find the first { and the last }
-         start = text.find('{')
-         end = text.rfind('}')
-         if start == -1 or end == -1:
-             return None
-         json_str = text[start:end+1]
-         # Validate JSON
-         json.loads(json_str)
-         return json_str
-     except json.JSONDecodeError:
-         return None

  def update_row_in_supabase(row_id: str, fixed_data: Dict[str, Any]) -> bool:
      """
@@ -233,117 +348,195 @@ def update_row_in_supabase(row_id: str, fixed_data: Dict[str, Any]) -> bool:
      """
      try:
          response = supabase.table("exam_contents").update(fixed_data).eq("id", row_id).execute()
-
          # Check if data exists in the response
          if response.data:
-             logging.info(f"Successfully updated row ID {row_id}.")
              return True
          else:
-             logging.error(f"Failed to update row ID {row_id}.")
              return False

      except Exception as e:
-         logging.error(f"Exception while updating row ID {row_id}: {str(e)}")
          return False

- def process_row(row: Dict[str, Any], progress_counter: AtomicCounter, total_rows: int) -> Dict[str, Any]:
-     """
-     Process a single row with progress tracking.
-     Returns a dictionary with the results.
-     """
-     row_id = row.get('id')
      result = {
-         'row_id': row_id,
          'success': False,
-         'message': ''
      }
-
      try:
          if not row_id:
              result['message'] = "Row without ID found"
              return result

          if check_row_quality(row):
              success = update_row_in_supabase(row_id, {'is_fixed': True})
              result['success'] = success
-             result['message'] = "Good quality, marked as fixed"
              progress_counter.increment()
              return result

          fixed_data = generate_fixed_content(row)
          if not fixed_data:
              result['message'] = "Failed to fix content"
              progress_counter.increment()
              return result

          success = update_row_in_supabase(row_id, fixed_data)
          result['success'] = success
-         result['message'] = "Successfully fixed and updated" if success else "Failed to update"

      except Exception as e:
          result['message'] = f"Error: {str(e)}"
-         logging.error(f"Error processing row {row_id}: {str(e)}")
-
      progress_counter.increment()
-     progress = progress_counter.value()
-     if progress % 10 == 0:  # Update progress every 10 rows
-         print(f"Progress: {progress}/{total_rows} rows processed")
-
      return result

- def main():
      """
-     Main function to process and fix exam questions in Supabase using multithreading.
-     """
-     logging.info("Starting fix.py script with multithreading.")

-     try:
-         # Fetch only unfixed rows from exam_contents
-         response = supabase.table("exam_contents").select("*").eq("is_fixed", False).execute()
-         rows = response.data
-         total_rows = len(rows)
-         logging.info(f"Fetched {total_rows} unfixed rows from exam_contents.")

-         if total_rows == 0:
-             logging.info("No unfixed rows found in exam_contents. Exiting.")
-             print("No unfixed rows found in exam_contents. Exiting.")
-             return

          # Initialize counters
-         progress_counter = AtomicCounter()
          success_count = 0
          failure_count = 0

          # Create a thread pool
-         max_workers = min(32, total_rows)  # Cap at 32 threads or total rows, whichever is smaller
-         print(f"Starting processing with {max_workers} threads...")
-
          with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-             # Submit all rows for processing
-             future_to_row = {
-                 executor.submit(process_row, row, progress_counter, total_rows): row
-                 for row in rows
-             }
-
-             # Process completed futures as they finish
-             for future in concurrent.futures.as_completed(future_to_row):
-                 result = future.result()
-                 if result['success']:
-                     success_count += 1
-                 else:
-                     failure_count += 1
-                     logging.warning(f"Failed to process row {result['row_id']}: {result['message']}")
-
-         # Final statistics
-         logging.info(f"Processing completed. Success: {success_count}, Failures: {failure_count}")
-         print(f"\nProcessing completed:")
-         print(f"Total rows processed: {total_rows}")
-         print(f"Successful updates: {success_count}")
-         print(f"Failed updates: {failure_count}")

      except Exception as e:
-         logging.error(f"An unexpected error occurred: {str(e)}")
-         print(f"An unexpected error occurred: {str(e)}")

  if __name__ == "__main__":
      main()
  # fix.py

  import os
+ import re
  import json
  import logging
  import concurrent.futures
+ from typing import Dict, Any, Optional, List
  import queue
+ import time
+ from datetime import datetime
+ import threading
+ import functools

+ from openai import AzureOpenAI
  from supabase import create_client, Client
+ from tqdm import tqdm
  from dotenv import load_dotenv
+ from ratelimiter import RateLimiter
+
+ # Set up logging with thread safety and custom formatting
+ class CustomFormatter(logging.Formatter):
+     """Custom formatter with colors and better formatting"""
+     grey = "\x1b[38;21m"
+     blue = "\x1b[38;5;39m"
+     yellow = "\x1b[38;5;226m"
+     red = "\x1b[38;5;196m"
+     bold_red = "\x1b[31;1m"
+     reset = "\x1b[0m"
+
+     def __init__(self, fmt):
+         super().__init__()
+         self.fmt = fmt
+         self.FORMATS = {
+             logging.DEBUG: self.grey + self.fmt + self.reset,
+             logging.INFO: self.blue + self.fmt + self.reset,
+             logging.WARNING: self.yellow + self.fmt + self.reset,
+             logging.ERROR: self.red + self.fmt + self.reset,
+             logging.CRITICAL: self.bold_red + self.fmt + self.reset
+         }
+
+     def format(self, record):
+         log_fmt = self.FORMATS.get(record.levelno)
+         formatter = logging.Formatter(log_fmt)
+         return formatter.format(record)
+
+ # Set up logging configuration
+ logger = logging.getLogger('fix')
+ logger.setLevel(logging.INFO)
+
+ # File handler with simple formatting
+ file_handler = logging.FileHandler('fix.log')
+ file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+ logger.addHandler(file_handler)
+
+ # Console handler with color formatting
+ console_handler = logging.StreamHandler()
+ console_handler.setFormatter(CustomFormatter('%(asctime)s - %(levelname)s - %(message)s'))
+ logger.addHandler(console_handler)
+
+ # Create a summary log file for each run
+ current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+ summary_file = f'fix_summary_{current_time}.log'
+ summary_handler = logging.FileHandler(summary_file)
+ summary_handler.setFormatter(logging.Formatter('%(message)s'))
+ summary_logger = logging.getLogger('summary')
+ summary_logger.addHandler(summary_handler)
+ summary_logger.setLevel(logging.INFO)
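A quick sanity check of what the handler setup above produces (a usage sketch, not part of the commit): every record lands in fix.log with plain timestamps, the console repeats it wrapped in per-level ANSI colors, and the separate 'summary' logger writes only to its timestamped file.

logger.info("processed batch")               # fix.log: plain text; console: blue
logger.warning("slow API response")          # console: yellow
logger.error("update failed")                # console: red
summary_logger.info("=== run summary ===")   # only in fix_summary_<timestamp>.log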

  # Load environment variables from .env file (if present)
  load_dotenv()
  MIN_PASSAGE_WORDS = 100  # Minimum number of words for reading_passage
  VALID_CORRECT_ANSWERS = {'A', 'B', 'C', 'D'}
  EXAM_TYPES = ["SAT", "IELTS", "TOEFL"]
+ DIFFICULTY_LEVELS = ["Easy", "Medium", "Hard"]

  # Load environment variables
  SUPABASE_URL = os.getenv("SUPABASE_DB_URL")
      missing_vars.append("AZURE_OPENAI_DEPLOYMENT_NAME")

  if missing_vars:
+     error_msg = f"Missing required environment variables: {', '.join(missing_vars)}"
+     logger.error(error_msg)
+     raise ValueError(error_msg)

  # Initialize Supabase client
  supabase: Client = create_client(SUPABASE_URL, SUPABASE_API_KEY)

+ # Initialize Azure OpenAI client
  client = AzureOpenAI(
+     api_key=AZURE_OPENAI_KEY,
+     api_version=AZURE_OPENAI_API_VERSION,
+     azure_endpoint=AZURE_OPENAI_ENDPOINT
  )

  # Thread-safe counter for progress tracking
  class AtomicCounter:
      def __init__(self, initial=0):
          self._value = initial
+         self._lock = threading.Lock()

      def increment(self):
          with self._lock:

          with self._lock:
              return self._value
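AtomicCounter now builds its lock with threading.Lock() in place of the removed bare Lock import. A small sketch of why the lock matters (assuming increment() adds one, as the name and the elided body suggest): concurrent unlocked updates to a shared int can be lost.

pool = concurrent.futures.ThreadPoolExecutor(max_workers=8)
counter = AtomicCounter()
for _ in range(1000):
    pool.submit(counter.increment)
pool.shutdown(wait=True)
print(counter.value())  # reliably 1000 thanks to the lock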
+ class RateLimiter:
+     """Rate limiter implementation using token bucket algorithm"""
+     def __init__(self, max_calls: int, period: float):
+         self.max_calls = max_calls
+         self.period = period
+         self.calls = []
+         self.lock = threading.Lock()
+
+     def __call__(self, func):
+         @functools.wraps(func)
+         def wrapped(*args, **kwargs):
+             with self.lock:
+                 now = time.time()
+                 # Remove old calls outside the window
+                 self.calls = [call for call in self.calls if call > now - self.period]
+
+                 if len(self.calls) >= self.max_calls:
+                     sleep_time = self.calls[0] - (now - self.period)
+                     if sleep_time > 0:
+                         time.sleep(sleep_time)
+                     # Recalculate after sleep
+                     now = time.time()
+                     self.calls = [call for call in self.calls if call > now - self.period]
+
+                 self.calls.append(now)
+
+             return func(*args, **kwargs)
+         return wrapped
+
+ # Initialize Rate Limiter: 60 calls per minute
+ rate_limiter = RateLimiter(max_calls=60, period=60)
+
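Despite its docstring, the RateLimiter above is a sliding-window limiter rather than a token bucket: it records the timestamp of each call and, once max_calls accumulate inside the window, sleeps until the oldest ages out. The sleep happens while self.lock is held, so concurrent callers queue behind it, which suits a global API throttle. A minimal usage sketch with hypothetical names:

throttle = RateLimiter(max_calls=2, period=1.0)  # at most 2 calls per second

@throttle
def ping(i: int) -> int:
    return i  # stand-in for the real Azure OpenAI call below

for i in range(5):
    ping(i)  # every third call within a one-second window blocks until a slot frees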
+ @rate_limiter
+ def generate_fixed_content(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+     """
+     Uses Azure OpenAI to generate fixed content for a row.
+     Returns a dictionary with fixed content or None if generation fails.
+     """
+     try:
+         # Create system message with formatting requirements
+         system_message = """You are an expert at fixing exam questions. Follow these rules:
+ 1. Maintain academic language and tone
+ 2. Keep all factual information unchanged
+ 3. Fix grammar and clarity issues
+ 4. Ensure options are clear and distinct
+ 5. Format text consistently"""
+
+         # Create user message with the content to fix
+         user_message = f"""Please fix the following exam question content:
+ Reading Passage: {row.get('reading_passage', '')}
+ Question: {row.get('question_text', '')}
+ Options:
+ A) {row.get('option_a', '')}
+ B) {row.get('option_b', '')}
+ C) {row.get('option_c', '')}
+ D) {row.get('option_d', '')}
+ Explanation: {row.get('explanation', '')}"""
+
+         # Call Azure OpenAI API
+         response = client.chat.completions.create(
+             model=AZURE_OPENAI_DEPLOYMENT_NAME,
+             messages=[
+                 {"role": "system", "content": system_message},
+                 {"role": "user", "content": user_message}
+             ],
+             temperature=0.3,
+             max_tokens=2000,
+             top_p=0.95,
+             frequency_penalty=0,
+             presence_penalty=0
+         )
+
+         # Extract the response content
+         if not response.choices:
+             logger.error("No response generated from OpenAI")
+             return None
+
+         content = response.choices[0].message.content
+
+         # Parse the response using regex
+         fixed_data = {}
+
+         # Extract reading passage
+         reading_match = re.search(r"Reading Passage:\s*(.*?)(?=Question:|$)", content, re.DOTALL)
+         if reading_match:
+             fixed_data['reading_passage'] = reading_match.group(1).strip()
+
+         # Extract question
+         question_match = re.search(r"Question:\s*(.*?)(?=Options:|$)", content, re.DOTALL)
+         if question_match:
+             fixed_data['question_text'] = question_match.group(1).strip()
+
+         # Extract options
+         options_pattern = {
+             'option_a': r"A\)\s*(.*?)(?=B\)|$)",
+             'option_b': r"B\)\s*(.*?)(?=C\)|$)",
+             'option_c': r"C\)\s*(.*?)(?=D\)|$)",
+             'option_d': r"D\)\s*(.*?)(?=Explanation:|$)"
+         }
+
+         for key, pattern in options_pattern.items():
+             match = re.search(pattern, content, re.DOTALL)
+             if match:
+                 fixed_data[key] = match.group(1).strip()
+
+         # Extract explanation
+         explanation_match = re.search(r"Explanation:\s*(.*?)$", content, re.DOTALL)
+         if explanation_match:
+             fixed_data['explanation'] = explanation_match.group(1).strip()
+
+         # Validate that all required fields are present
+         required_fields = ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']
+         if not all(field in fixed_data for field in required_fields):
+             logger.error("Missing required fields in generated content")
+             return None
+
+         # Copy over unchanged fields
+         for key in row:
+             if key not in fixed_data and key != 'id':
+                 fixed_data[key] = row[key]
+
+         return fixed_data
+
+     except Exception as e:
+         logger.error(f"Error generating fixed content: {str(e)}")
+         return None
+
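The rewritten generate_fixed_content drops the old JSON response mode and instead asks the model for labeled plain text, recovering each field with lookahead regexes. A self-contained sketch of that parsing step on a canned response (the sample text is illustrative only):

import re

sample = """Reading Passage: Ocean currents redistribute heat around the globe.
Question: What primarily drives surface ocean currents?
Options:
A) Tides
B) Wind
C) Salinity
D) Earthquakes
Explanation: Prevailing winds drag on the water, therefore B is correct."""

question = re.search(r"Question:\s*(.*?)(?=Options:|$)", sample, re.DOTALL)
option_b = re.search(r"B\)\s*(.*?)(?=C\)|$)", sample, re.DOTALL)
print(question.group(1).strip())  # -> What primarily drives surface ocean currents?
print(option_b.group(1).strip())  # -> Wind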
  def word_count(text: str) -> int:
      """Returns the number of words in a given text."""
      return len(text.split())
      # Skip if already fixed
      if row.get('is_fixed'):
          return True
+
+     # Required fields must be present and non-empty
      required_fields = [
          'exam_type', 'content_type', 'exam_section', 'domain', 'subdomain',
          'topic', 'difficulty_level', 'reading_passage', 'question_text',
          'option_a', 'option_b', 'option_c', 'option_d', 'correct_answer',
          'explanation'
      ]
+
      # Check for missing or empty required fields
      for field in required_fields:
+         value = row.get(field, '').strip() if isinstance(row.get(field), str) else row.get(field)
+         if not value:
              return False
+
+     # Check for valid exam type
+     if row['exam_type'] not in EXAM_TYPES:
+         return False
+
+     # Check for valid difficulty level
+     if row['difficulty_level'] not in DIFFICULTY_LEVELS:
+         return False
+
+     # Check for valid correct answer format
+     if not is_valid_correct_answer(row['correct_answer']):
+         return False
+
+     # Check for common OCR and formatting issues
      text_fields = ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']
      for field in text_fields:
          text = row.get(field, '')
          if isinstance(text, str):
+             # Check for OCR artifacts
+             if any(artifact in text.lower() for artifact in [
+                 'arebasedonthe', 'lineno', 'click here', 'seenext', 'seebelow',
+                 'answerthefollowing', 'choosethebest', 'selectthe'
+             ]):
+                 return False
+
+             # Check for formatting issues
+             if text.count('.') > 20:  # Too many periods might indicate formatting issues
+                 return False
+             if text.count('\n') > 20:  # Too many newlines might indicate formatting issues
+                 return False
+             if len(text.split()) < 2:  # Text should have at least 2 words
                  return False

+     # Check minimum length requirements
+     if len(row['reading_passage'].split()) < MIN_PASSAGE_WORDS:
+         return False

+     # Check for duplicate options
+     options = [row['option_a'], row['option_b'], row['option_c'], row['option_d']]
+     if len(set(options)) != len(options):
+         return False

+     # Check for explanation quality
+     explanation = row['explanation']
+     if len(explanation.split()) < 10:  # Explanation should be reasonably detailed
+         return False
+     if not any(word in explanation.lower() for word in ['because', 'since', 'as', 'therefore', 'thus', 'hence']):
+         return False

+     return True
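check_row_quality now calls is_valid_correct_answer, which this diff does not show. Given the VALID_CORRECT_ANSWERS constant defined near the top of the file, a plausible shape for the helper is sketched below; this is an assumption, not the committed code.

def is_valid_correct_answer(answer) -> bool:
    # Hypothetical reconstruction: accept a single letter from VALID_CORRECT_ANSWERS.
    return isinstance(answer, str) and answer.strip().upper() in VALID_CORRECT_ANSWERS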

  def update_row_in_supabase(row_id: str, fixed_data: Dict[str, Any]) -> bool:
      """

      """
      try:
          response = supabase.table("exam_contents").update(fixed_data).eq("id", row_id).execute()
+
          # Check if data exists in the response
          if response.data:
+             logger.debug(f"HTTP Request: PATCH https://{SUPABASE_URL}/rest/v1/exam_contents?id=eq.{row_id} \"HTTP/2 200 OK\"")
+             logger.info(f"Row {row_id}: Successfully updated.")
              return True
          else:
+             logger.error(f"Row {row_id}: Failed to update.")
              return False

      except Exception as e:
+         logger.error(f"Row {row_id}: Exception while updating - {str(e)}")
          return False
+ def process_row(row: Dict[str, Any], progress_counter: AtomicCounter, total_rows: int, row_number: int) -> Dict[str, Any]:
+     """Process a single row with progress tracking."""
      result = {
+         'row_id': row.get('id'),
          'success': False,
+         'message': '',
+         'changes_made': []
      }
+
      try:
+         row_id = row.get('id')
          if not row_id:
              result['message'] = "Row without ID found"
+             logger.warning(f"Row {row_number}: {result['message']}")
              return result

+         # Check initial quality
+         initial_quality_issues = []
+         if not row.get('reading_passage'):
+             initial_quality_issues.append("Missing reading passage")
+         if not row.get('question_text'):
+             initial_quality_issues.append("Missing question text")
+         if not all(row.get(f'option_{opt}') for opt in ['a', 'b', 'c', 'd']):
+             initial_quality_issues.append("Missing options")
+         if not row.get('correct_answer'):
+             initial_quality_issues.append("Missing correct answer")
+
+         if initial_quality_issues:
+             logger.info(f"Row {row_number}: Quality issues found - {', '.join(initial_quality_issues)}")
+
          if check_row_quality(row):
              success = update_row_in_supabase(row_id, {'is_fixed': True})
              result['success'] = success
+             result['message'] = "Already good quality, marked as fixed"
+             if success:
+                 logger.info(f"Row {row_number}: Already good quality. Marked as fixed.")
+             else:
+                 logger.error(f"Row {row_number}: Failed to mark as fixed.")
              progress_counter.increment()
              return result

+         # Generate fixed content
          fixed_data = generate_fixed_content(row)
          if not fixed_data:
              result['message'] = "Failed to fix content"
+             logger.error(f"Row {row_number}: Failed to generate fixed content.")
              progress_counter.increment()
              return result

+         # Compare changes
+         for field in ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']:
+             if fixed_data.get(field) != row.get(field):
+                 result['changes_made'].append(field)
+
+         fixed_data['is_fixed'] = True
          success = update_row_in_supabase(row_id, fixed_data)
          result['success'] = success

+         if success:
+             changes = ', '.join(result['changes_made']) if result['changes_made'] else 'No changes needed'
+             result['message'] = f"Fixed successfully. Changes in: {changes}"
+             logger.info(f"Row {row_number}: Fixed successfully. Modified: {changes}")
+         else:
+             result['message'] = "Failed to update after fixing"
+             logger.error(f"Row {row_number}: Failed to update after fixing.")
+
      except Exception as e:
          result['message'] = f"Error: {str(e)}"
+         logger.error(f"Row {row_number}: Error processing - {str(e)}")
+
      progress_counter.increment()
      return result
+ def fetch_all_unfixed_rows(supabase_client: Client, batch_size: int = 1000):
      """
+     Fetches all unfixed rows from the exam_contents table in batches.
+
+     Args:
+         supabase_client (Client): The Supabase client instance.
+         batch_size (int): Number of rows to fetch per batch.
+
+     Yields:
+         List[Dict[str, Any]]: A batch of rows.
+     """
+     # Initialize the starting range
+     start = 0
+     while True:
+         # Fetch a batch of rows
+         response = supabase_client.table("exam_contents")\
+             .select("*")\
+             .eq("is_fixed", False)\
+             .range(start, start + batch_size - 1)\
+             .execute()
+
+         batch = response.data
+         if not batch:
+             break  # No more rows to fetch
+
+         yield batch
+         start += batch_size
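fetch_all_unfixed_rows pages with .range() offsets and stops at the first empty batch. One caveat worth noting: workers flip is_fixed to true while pagination is still advancing, so the filtered result set shrinks between pages and offset paging may skip rows; a rerun of the script would pick up anything missed. Usage sketch (batch size chosen for illustration):

for batch in fetch_all_unfixed_rows(supabase, batch_size=500):
    print(f"fetched {len(batch)} unfixed rows")  # each batch is a list of row dicts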

+ def main():
+     """Main function to process and fix exam questions in Supabase using multithreading."""
+     start_time = time.time()
+     logger.info("Starting fix.py script")
+     summary_logger.info("\n=== Question Fix Summary ===\n")
+
+     try:
          # Initialize counters
+         total_rows = 0
          success_count = 0
          failure_count = 0
+         changes_by_field = {
+             'reading_passage': 0,
+             'question_text': 0,
+             'option_a': 0,
+             'option_b': 0,
+             'option_c': 0,
+             'option_d': 0,
+             'explanation': 0
+         }

          # Create a thread pool
+         max_workers = min(32, os.cpu_count() * 2)  # Adjust based on CPU cores
+         logger.info(f"Initializing with {max_workers} threads")
+
          with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+             # Initialize progress tracking
+             progress_counter = AtomicCounter()
+             futures = []
+
+             # Process rows in batches
+             for batch in fetch_all_unfixed_rows(supabase):
+                 total_rows += len(batch)
+                 for i, row in enumerate(batch):
+                     future = executor.submit(process_row, row, progress_counter, total_rows, i + 1)
+                     futures.append(future)
+
+             # Track progress with tqdm
+             with tqdm(total=total_rows, desc="Processing Rows", unit="row", dynamic_ncols=True) as pbar:
+                 for future in concurrent.futures.as_completed(futures):
+                     result = future.result()
+                     if result['success']:
+                         success_count += 1
+                         # Update changes counter
+                         for field in result['changes_made']:
+                             changes_by_field[field] = changes_by_field.get(field, 0) + 1
+                     else:
+                         failure_count += 1
+                     pbar.update(1)
+
+         # Calculate execution time
+         execution_time = time.time() - start_time
+
+         # Log final statistics
+         summary = [
+             "\n=== Final Statistics ===",
+             f"Total questions processed: {total_rows}",
+             f"Successful updates: {success_count}",
+             f"Failed updates: {failure_count}",
+             f"Execution time: {execution_time:.2f} seconds",
+             "\nChanges by field:",
+             *[f"- {field}: {count}" for field, count in changes_by_field.items() if count > 0],
+             "\n=== End of Summary ===\n"
+         ]
+
+         # Log to both console and summary file
+         for line in summary:
+             logger.info(line)
+             summary_logger.info(line)

      except Exception as e:
+         error_msg = f"An unexpected error occurred: {str(e)}"
+         logger.error(error_msg)
+         summary_logger.error(error_msg)

  if __name__ == "__main__":
      main()