SkyNait committed on
Commit 9351a05 · 1 Parent(s): c10a9aa

page handling

__pycache__/inference_svm_model.cpython-310.pyc CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ
 
__pycache__/mineru_single.cpython-310.pyc CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ
 
__pycache__/table_row_extraction.cpython-310.pyc CHANGED
Binary files a/__pycache__/table_row_extraction.cpython-310.pyc and b/__pycache__/table_row_extraction.cpython-310.pyc differ
 
__pycache__/topic_extraction.cpython-310.pyc CHANGED
Binary files a/__pycache__/topic_extraction.cpython-310.pyc and b/__pycache__/topic_extraction.cpython-310.pyc differ
 
__pycache__/worker.cpython-310.pyc CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
 
page_range.py ADDED
@@ -0,0 +1,258 @@
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import re
4
+ import json
5
+ import logging
6
+ import fitz
7
+ import requests
8
+ from statistics import mode, median
9
+
10
+ from google import genai
11
+ from google.genai import types
12
+
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
+ def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> list:
17
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
18
+ st_norm = re.sub(r"\s+", " ", search_text).strip()
19
+ found = []
20
+ for i in range(doc.page_count):
21
+ raw = doc[i].get_text("raw")
22
+ norm = re.sub(r"\s+", " ", raw).strip()
23
+ if st_norm in norm:
24
+ found.append(i)
25
+ doc.close()
26
+ return sorted(found)
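
A note on the matcher above: both the search text and each page's raw text are collapsed to single-spaced strings, so a TOC title still matches a heading that wraps across lines on the page. A minimal standalone sketch of that normalisation (the title string is made up):

import re

def norm(s: str) -> str:
    # Collapse every run of whitespace to a single space, as find_all_occurrences does.
    return re.sub(r"\s+", " ", s).strip()

page_text = "Paper 1 and\n    Paper 2: Pure Mathematics"
assert norm("Paper 1 and Paper 2") in norm(page_text)
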
27
+
28
+ class GeminiTopicExtractor:
29
+ def __init__(self, api_key: str = None, num_pages: int = 20):
30
+ self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
31
+ self.num_pages = num_pages
32
+
33
+ def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
34
+ text_parts = []
35
+ try:
36
+ if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
37
+ response = requests.get(pdf_path)
38
+ if response.status_code != 200:
39
+ logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
40
+ return ""
41
+ pdf_bytes = response.content
42
+ else:
43
+ with open(pdf_path, "rb") as f:
44
+ pdf_bytes = f.read()
45
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
46
+ pages_to_read = min(num_pages, doc.page_count)
47
+ for i in range(pages_to_read):
48
+ raw_text = doc[i].get_text("raw")
49
+ text_parts.append(raw_text)
50
+ doc.close()
51
+ except Exception as e:
52
+ logger.error(f"Could not open PDF: {e}")
53
+ return "\n".join(text_parts)
54
+
55
+ def extract_subtopics(self, pdf_path: str) -> dict:
56
+ first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
57
+ if not first_pages_text.strip():
58
+ logger.error("No text from first pages => cannot extract subtopics.")
59
+ return {}
60
+ prompt = f"""
61
+ You have the first pages of a PDF specification, including a table of contents.
62
+ Instructions:
63
+ 1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
64
+ 2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
65
+ 3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
66
+ 4. Output only valid JSON of the form:
67
+ {{
68
+ "Subtopic A": [start_page, end_page],
69
+ "Subtopic B": [start_page, end_page]
70
+ }}
71
+ 5. If you can't find any subtopics, return an empty JSON.
72
+ Important notes:
73
+ - The correct "end_page" must be the page number of the next topic or subtopic minus 1.
74
+ - The final output must be valid JSON only, with no extra text or code blocks.
75
+ Examples:
76
+ 1. Given this table of contents:
77
+ 1 Introduction – 2
78
+ Why choose Edexcel A Level Mathematics? - 2
79
+ Supporting you in planning and implementing this qualification - 3
80
+ Qualification at a glance - 5
81
+ 2 Subject content and assessment information – 7
82
+ Paper 1 and Paper 2: Pure Mathematics - 11
83
+ Paper 3: Statistics and Mechanics - 30
84
+ Assessment Objectives - 40
85
+ 3 Administration and general information – 42
86
+ Entries - 42
87
+ Access arrangements, reasonable adjustments, special consideration and malpractice - 42
88
+ Student recruitment and progression - 45
89
+
90
+ The correct output should be:
91
+ {{
92
+ "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
93
+ "Paper 3: Statistics and Mechanics": [30, 38]
94
+ }}
95
+ 2. Given this table of contents:
96
+ Qualification at a glance – 1
97
+ Assessment Objectives and weightings - 4
98
+ Knowledge, skills and understanding – 5
99
+ Theme 1: Introduction to markets and market failure - 5
100
+ Theme 2: The UK economy – performance and policies - 11
101
+ Theme 3: Business behaviour and the labour market - 21
102
+ Theme 4: A global perspective - 29
103
+ Assessment – 39
104
+ Assessment summary - 39
105
+ Assessment objectives - 41
106
+ Assessment overview - 42
107
+
108
+ The correct output should be:
109
+ {{
110
+ "Theme 1: Introduction to markets and market failure": [5, 10],
111
+ "Theme 2: The UK economy – performance and policies": [11, 20],
112
+ "Theme 3: Business behaviour and the labour market": [21, 28],
113
+ "Theme 4: A global perspective": [29, 38]
114
+ }}
115
+ Now, extract topics from this text:
116
+ {first_pages_text}
117
+ """
118
+ global _GEMINI_CLIENT
119
+ if '_GEMINI_CLIENT' not in globals() or _GEMINI_CLIENT is None:
120
+ _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
121
+ client = _GEMINI_CLIENT
122
+ try:
123
+ response = client.models.generate_content(
124
+ model="gemini-2.0-flash",
125
+ contents=[prompt],
126
+ config=types.GenerateContentConfig(temperature=0.0)
127
+ )
128
+ if not response or not response.text:
129
+ logger.warning("No text from LLM => returning empty subtopics.")
130
+ return {}
131
+ raw_json = response.text.strip()
132
+ cleaned = raw_json.replace("```json", "").replace("```", "")
133
+ try:
134
+ data = json.loads(cleaned)
135
+ except Exception as json_err:
136
+ logger.error(f"JSON parsing error: {json_err}")
137
+ return {}
138
+ final_dict = {}
139
+ found_sub_dict = None
140
+ for k, v in data.items():
141
+ if isinstance(v, dict):
142
+ found_sub_dict = v
143
+ break
144
+ if found_sub_dict is not None:
145
+ for subk, rng in found_sub_dict.items():
146
+ if isinstance(rng, list) and len(rng) == 2:
147
+ final_dict[subk] = rng
148
+ else:
149
+ for subk, rng in data.items():
150
+ if isinstance(rng, list) and len(rng) == 2:
151
+ final_dict[subk] = rng
152
+ return final_dict
153
+ except Exception as e:
154
+ logger.error(f"Gemini subtopic extraction error: {e}")
155
+ return {}
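
The unwrapping logic above accepts either a flat mapping or one nested a single level deep; a small sketch of the two shapes it tolerates (topic names, the wrapper key, and page numbers are illustrative):

flat = {"Theme 1": [5, 10], "Theme 2": [11, 20]}
nested = {"topics": {"Theme 1": [5, 10], "Theme 2": [11, 20]}}
# Both reduce to {"Theme 1": [5, 10], "Theme 2": [11, 20]};
# any entry whose value is not a two-element list is dropped.
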
156
+
157
+ class TopicRangeExtractor:
158
+ def __init__(self, gemini_api_key: str):
159
+ self.gemini_api_key = gemini_api_key
160
+ self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
161
+
162
+ def process(self, pdf_path: str) -> dict:
163
+ logger.info(f"Processing PDF: {pdf_path}")
164
+ subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
165
+ logger.info(f"Gemini returned subtopics: {subtopics}")
166
+
167
+ if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
168
+ response = requests.get(pdf_path)
169
+ if response.status_code != 200:
170
+ logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
171
+ raise Exception(f"Failed to download PDF: {pdf_path}")
172
+ pdf_bytes = response.content
173
+ logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
174
+ else:
175
+ with open(pdf_path, "rb") as f:
176
+ pdf_bytes = f.read()
177
+ logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
178
+
179
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
180
+ total_pages = doc.page_count
181
+ doc.close()
182
+
183
+ # Compute global offset and adjust subtopic ranges.
184
+ if not subtopics:
185
+ global_offset = 0
186
+ subtopics_corrected = {}
187
+ else:
188
+ offset_candidates = []
189
+ subtopics_corrected = {}
190
+ for subname, rng in subtopics.items():
191
+ if not (isinstance(rng, list) and len(rng) == 2):
192
+ continue
193
+ start_p, end_p = rng
194
+ occs = find_all_occurrences(pdf_bytes, subname)
195
+ for p in occs:
196
+ candidate = p - (start_p - 1)
197
+ if candidate > 0:
198
+ offset_candidates.append(candidate)
199
+ subtopics_corrected[subname] = rng
200
+
201
+ if offset_candidates:
202
+ try:
203
+ global_offset = mode(offset_candidates)
204
+ except Exception:
205
+ global_offset = int(median(offset_candidates))
206
+ else:
207
+ global_offset = 0
208
+ logger.info(f"Computed global offset: {global_offset}")
209
+
210
+ # Adjust ranges by applying the global offset.
211
+ adjusted_topics = {}
212
+ for subname, rng in subtopics_corrected.items():
213
+ start_p, end_p = rng
214
+ s0 = (start_p - 1) + global_offset
215
+ e0 = (end_p - 1) + global_offset
216
+ adjusted_topics[subname] = [s0, e0]
217
+
218
+ # Sort the topics by their adjusted start page.
219
+ sorted_topics = sorted(adjusted_topics.items(), key=lambda item: item[1][0])
220
+ effective_ranges = {}
221
+ # For each subtopic, if there is a next one, set its effective end to the next topic's start minus 1.
222
+ for i, (name, (start, end)) in enumerate(sorted_topics):
223
+ if i < len(sorted_topics) - 1:
224
+ next_start = sorted_topics[i+1][1][0]
225
+ effective_end = min(end, next_start - 1)
226
+ else:
227
+ effective_end = end
228
+ effective_ranges[name] = [start, effective_end]
229
+
230
+ # Build the union of pages from each effective range.
231
+ # For every topic except the last, use a half-open range to skip the boundary page.
232
+ real_pages_set = set()
233
+ for i, (name, (start, end)) in enumerate(effective_ranges.items()):
234
+ if i < len(effective_ranges) - 1:
235
+ # End is exclusive so the boundary page (end) is skipped.
236
+ for pp in range(start, end):
237
+ if 0 <= pp < total_pages:
238
+ real_pages_set.add(pp)
239
+ else:
240
+ # For the last topic include the end page.
241
+ for pp in range(start, end + 1):
242
+ if 0 <= pp < total_pages:
243
+ real_pages_set.add(pp)
244
+ page_range = sorted(real_pages_set)
245
+
246
+ return {
247
+ "page_range": page_range
248
+ }
249
+
250
+ if __name__ == "__main__":
251
+ input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
252
+ gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
253
+ try:
254
+ extractor = TopicRangeExtractor(gemini_api_key=gemini_key)
255
+ result = extractor.process(input_pdf)
256
+ print(json.dumps(result, indent=2))
257
+ except Exception as e:
258
+ logger.error(f"Processing failed: {e}")
topic_extr.py CHANGED
@@ -1,57 +1,22 @@
1
  #!/usr/bin/env python3
2
  import os
3
- import re
4
- import gc
5
  import json
6
  import logging
 
7
  import fitz
8
- import boto3
9
- import base64
10
- import time
11
- import asyncio
12
- import tempfile
13
  import requests
14
- from io import BytesIO
15
- from typing import List, Dict, Any
16
-
17
  import torch
18
- import cv2
19
- import numpy as np
20
-
21
- from google import genai
22
- from google.genai import types
23
 
24
  from magic_pdf.data.dataset import PymuDocDataset
25
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
26
- from magic_pdf.data.data_reader_writer.base import DataWriter
27
- from table_row_extraction import TableExtractor
28
 
29
  logging.basicConfig(level=logging.INFO)
30
  logger = logging.getLogger(__name__)
31
- logger.setLevel(logging.INFO)
32
- file_handler = logging.FileHandler("topic_extraction.log")
33
- file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s"))
34
- logger.addHandler(file_handler)
35
-
36
- _GEMINI_CLIENT = None
37
 
38
- # helper functions, also global
39
- def unify_whitespace(text: str) -> str:
40
- return re.sub(r"\s+", " ", text).strip()
41
-
42
- def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
43
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
44
- st_norm = unify_whitespace(search_text)
45
- found = []
46
- for i in range(doc.page_count):
47
- raw = doc[i].get_text("raw")
48
- norm = unify_whitespace(raw)
49
- if st_norm in norm:
50
- found.append(i)
51
- doc.close()
52
- return sorted(found)
53
-
54
- def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
55
  if not page_indices:
56
  raise ValueError("No page indices provided for subset creation.")
57
  doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
@@ -67,121 +32,33 @@ def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> byt
67
  doc.close()
68
  return subset_bytes
69
 
70
- def unify_topic_name(raw_title: str, children_subtopics: list) -> str:
71
- """
72
- Clean up a topic title:
73
- - Remove any trailing "continued".
74
- - If the title does not start with a number but children provide a consistent numeric prefix,
75
- then prepend that prefix.
76
- """
77
- title = raw_title.strip()
78
- # Remove trailing "continued"
79
- title = re.sub(r"\s+continued\s*$", "", title, flags=re.IGNORECASE)
80
-
81
- # If title already starts with a number, use it as is.
82
- if re.match(r"^\d+", title):
83
- return title
84
-
85
- # Otherwise, try to deduce a numeric prefix from the children.
86
- prefixes = []
87
- for child in children_subtopics:
88
- child_title = child.get("title", "").strip()
89
- m = re.match(r"^(\d+)\.", child_title)
90
- if m:
91
- prefixes.append(m.group(1))
92
- if prefixes:
93
- # If all numeric prefixes in children are the same, use that prefix.
94
- if all(p == prefixes[0] for p in prefixes):
95
- # If title is non-empty, prepend the number; otherwise, use a fallback.
96
- if title:
97
- title = f"{prefixes[0]} {title}"
98
- else:
99
- title = f"{prefixes[0]} Topic"
100
- # Optionally, handle known broken titles explicitly.
101
- if title.lower() in {"gonometry"}:
102
- # For example, if children indicate "5.X", set to "5 Trigonometry"
103
- if prefixes and prefixes[0] == "5":
104
- title = "5 Trigonometry"
105
- return title
106
-
107
- def merge_topics(subtopic_list: list) -> list:
108
  """
109
- Merge topics with an enhanced logic:
110
- 1. Clean up each topic's title using unify_topic_name.
111
- 2. Group topics by the parent's numeric prefix (if available). Topics without a numeric prefix use their title.
112
- 3. Reassign children: for each child whose title (e.g. "2.1") does not match its current parent's numeric prefix,
113
- move it to the parent with the matching prefix if available.
114
- 4. Remove duplicate children by merging contents.
115
- 5. Sort parent topics and each parent's children by their numeric ordering.
 
116
  """
117
- # First, merge topics by parent's numeric prefix.
118
- merged = {}
119
- for topic_obj in subtopic_list:
120
- raw_title = topic_obj.get("title", "")
121
- children = topic_obj.get("children", [])
122
- contents = topic_obj.get("contents", [])
123
- new_title = unify_topic_name(raw_title, children)
124
- # Extract parent's numeric prefix, if present.
125
- m = re.match(r"^(\d+)", new_title)
126
- parent_prefix = m.group(1) if m else None
127
- key = parent_prefix if parent_prefix is not None else new_title
128
-
129
- if key not in merged:
130
- merged[key] = {
131
- "title": new_title,
132
- "contents": list(contents),
133
- "children": list(children),
134
- }
135
  else:
136
- # Merge contents and children; choose the longer title.
137
- if len(new_title) > len(merged[key]["title"]):
138
- merged[key]["title"] = new_title
139
- merged[key]["contents"].extend(contents)
140
- merged[key]["children"].extend(children)
141
-
142
- # Build a lookup of merged topics by their numeric prefix.
143
- parent_lookup = merged # keys are numeric prefixes or the full title for non-numeric ones.
144
-
145
- # Reassign children to the correct parent based on their numeric prefix.
146
- for key, topic in merged.items():
147
- new_children = []
148
- for child in topic["children"]:
149
- child_title = child.get("title", "").strip()
150
- m_child = re.match(r"^(\d+)\.", child_title)
151
- if m_child:
152
- child_prefix = m_child.group(1)
153
- if key != child_prefix and child_prefix in parent_lookup:
154
- # Reassign this child to the proper parent.
155
- parent_lookup[child_prefix]["children"].append(child)
156
- continue
157
- new_children.append(child)
158
- topic["children"] = new_children
159
-
160
- # Remove duplicate children by merging their contents.
161
- for topic in merged.values():
162
- child_map = {}
163
- for child in topic["children"]:
164
- ctitle = child.get("title", "").strip()
165
- if ctitle not in child_map:
166
- child_map[ctitle] = child
167
- else:
168
- child_map[ctitle]["contents"].extend(child.get("contents", []))
169
- child_map[ctitle]["children"].extend(child.get("children", []))
170
- topic["children"] = list(child_map.values())
171
-
172
- # Sort children by full numeric order (e.g. "2.1" < "2.10" < "2.2").
173
- def parse_subtopic_num(subtitle):
174
- digits = re.findall(r"\d+", subtitle)
175
- return tuple(int(d) for d in digits) if digits else (9999,)
176
- topic["children"].sort(key=lambda ch: parse_subtopic_num(ch.get("title", "")))
177
-
178
- # Convert merged topics to a sorted list.
179
- def parse_parent_num(topic):
180
- m = re.match(r"^(\d+)", topic.get("title", ""))
181
- return int(m.group(1)) if m else 9999
182
- final_list = list(merged.values())
183
- final_list.sort(key=lambda topic: parse_parent_num(topic))
184
- return final_list
185
 
186
  class s3Writer:
187
  def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
@@ -195,676 +72,44 @@ class s3Writer:
195
 
196
  def write(self, path: str, data: bytes) -> None:
197
  try:
 
198
  file_obj = BytesIO(data)
199
- self.client.upload_fileobj(
200
- file_obj,
201
- self.bucket,
202
- path
203
- )
204
  logger.info(f"Uploaded to S3: {path}")
205
  except Exception as e:
206
  logger.error(f"Failed to upload to S3: {str(e)}")
207
  raise
208
 
209
- def delete(self, path: str) -> None:
210
- try:
211
- self.client.delete_object(Bucket=self.bucket, Key=path)
212
- except Exception as e:
213
- logger.error(f"Failed to delete from S3: {str(e)}")
214
- raise
215
-
216
- def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
217
- arr = np.frombuffer(image_data, np.uint8)
218
- img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
219
- if img is not None:
220
- h, w, _ = img.shape
221
- if max(h, w) > max_dim:
222
- scale = max_dim / float(max(h, w))
223
- new_w = int(w * scale)
224
- new_h = int(h * scale)
225
- img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
226
- encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
227
- success, enc = cv2.imencode(".jpg", img, encode_params)
228
- if success:
229
- return enc.tobytes()
230
- return image_data
231
-
232
- def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
233
- """
234
- Existing Gemini call to classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE.
235
- """
236
- for attempt in range(max_retries + 1):
237
- try:
238
- prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
239
- The three-column 'table' image includes such key features:
240
- - Three columns header
241
- - Headers like 'Topics', 'Content', 'Guidelines', 'Amplification', 'Additional guidance notes', 'Area of Study'
242
- - Possibly sections (e.g. 8.4, 9.1)
243
- The two-column 'table' image includes such key features:
244
- - Two columns
245
- - Headers like 'Subject content', 'Additional information'
246
- - Possibly sections (e.g. 2.1, 3.4, G2, G3, )
247
- If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
248
- If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
249
- If the image is non-empty but does not show a table, respond with 'NO_TABLE'.
250
- Return only one of these exact labels.
251
- """
252
- global _GEMINI_CLIENT
253
- if _GEMINI_CLIENT is None:
254
- _GEMINI_CLIENT = genai.Client(api_key=api_key)
255
- client = _GEMINI_CLIENT
256
-
257
- resp = client.models.generate_content(
258
- model="gemini-2.0-flash",
259
- contents=[
260
- {
261
- "parts": [
262
- {"text": prompt},
263
- {
264
- "inline_data": {
265
- "mime_type": "image/jpeg",
266
- "data": base64.b64encode(image_data).decode('utf-8')
267
- }
268
- }
269
- ]
270
- }
271
- ],
272
- config=types.GenerateContentConfig(temperature=0.0)
273
- )
274
- if resp and resp.text:
275
- classification = resp.text.strip().upper()
276
- if "THREE" in classification:
277
- return "THREE_COLUMN"
278
- elif "TWO" in classification:
279
- return "TWO_COLUMN"
280
- elif "EMPTY" in classification:
281
- return "EMPTY_IMAGE"
282
- return "NO_TABLE"
283
- except Exception as e:
284
- logger.error(f"Gemini table classification error: {e}")
285
- if "503" in str(e):
286
- return "NO_TABLE"
287
- if attempt < max_retries:
288
- time.sleep(0.5)
289
- else:
290
- return "NO_TABLE"
291
-
292
- async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
293
- loop = asyncio.get_event_loop()
294
- preprocessed = preprocess_image(image_data)
295
- return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
296
-
297
- def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
298
- for attempt in range(max_retries + 1):
299
- try:
300
- prompt = """
301
- You are given an image from an educational curriculum specification for Gemini Flash 2. The image may contain:
302
- 1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
303
- 2) A subtopic heading in the format "<number>.<number>" or "<number>.<number>.<number>", for example "2.5", "2.6", "3.4", "2.1.1", "4.3.3" or "1.2.1".
304
- 3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
305
- 4) Possibly no relevant text or only truncated text (e.g. "Topics", "Subject content", "What students need to learn", "Content Amplification Additional guidance notes", etc.).
306
-
307
- Your task is to extract:
308
- - **"title"**: A recognized main topic or heading text.
309
- - **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4", "G2", "2.1.1", "4.1.1"), as an array of strings.
310
-
311
- Follow these rules:
312
-
313
- (1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued":
314
- - Remove the word "continued" if present.
315
- - Put that resulting text in "title". (e.g. "2 Algebra and functions")
316
- - "subtopics" should be an empty array, unless smaller subtopic numbers (e.g. "2.5") are also detected in the same text.
317
-
318
- (2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4":
319
- - Collect those exact strings in the JSON key "subtopics" (an array of strings).
320
- - "title" in this case should be an empty string if you only detect subtopics.
321
- (Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).
322
-
323
- (3) **If no main topic or subtopic is detected but the text appears to be a heading**, for example "Specialisation, division of labour and exchange", then:
324
- - Return:
325
- {
326
- "title": "<the heading text>",
327
- "subtopics": []
328
- }
329
-
330
- (4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
331
- - Use that left column text as "title".
332
- - "subtopics" remains empty.
333
- Example:
334
- If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
335
- {
336
- "title": "Scarcity, choice and opportunity cost",
337
- "subtopics": []
338
- }
339
-
340
- (5) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) or it appears to be a standalone column with text, treat it as a heading.
341
- - "subtopics" remains empty.
342
- Example:
343
- If there is only one column image that is "Specialisation, devision of labour and exchange" and the right column is not present, your output is:
344
- {
345
- "title": "Specialisation, devision of labour and exchange",
346
- "subtopics": []
347
- }
348
-
349
- (6) **If there is a character + digit pattern** in the left column of a two-column table (for example "G2", "G3", "G4", "C1"), treat that as a topic-like label:
350
- - Put that label text into "title" (e.g. "G2").
351
- - "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.
352
-
353
- (7) **Output must be valid JSON** in this exact structure, with no extra text or explanation:
354
- {
355
- "title": "...",
356
- "subtopics": [...]
357
- }
358
-
359
- (8) **If the image is blank or truncated**, defined as:
360
- - Contains no words at all (e.g. a blank white or black image), **OR**
361
- - Contains only snippet words/phrases such as "Topics", "Subject content", "Content Amplification Additional guidance notes", "What students need to learn" (including variations in background color), **OR**
362
- - Contains partial headings with no recognizable numeric or textual headings
363
- - Contains partial UI labels only, such as “Topics” in a gray bar or “What students need to learn” in a blue bar, with no additional meaningful text.
364
- then return:
365
- {
366
- "title": "EMPTY_IMAGE",
367
- "subtopics": []
368
- }
369
-
370
- (9) **If you cannot recognize any text matching the patterns above**, or the text is too partial/truncated to form a valid heading, also return:
371
- {
372
- "title": "EMPTY_IMAGE",
373
- "subtopics": []
374
- }
375
-
376
- **Examples**:
377
-
378
- - If the image text is "2 Algebra and functions continued", return:
379
- {
380
- "title": "2 Algebra and functions",
381
- "subtopics": []
382
- }
383
-
384
- - If the image text is "2.5 Solve linear and quadratic inequalities ...", return:
385
- {
386
- "title": "",
387
- "subtopics": ["2.5"]
388
- }
389
-
390
- - If the image text is "Specialisation, division of labour and exchange" (with no numeric patterns at all), return:
391
- {
392
- "title": "Specialisation, division of labour and exchange",
393
- "subtopics": []
394
- }
395
-
396
- - If the left column says "G2" and the right column has details, but no subtopic numbers, return:
397
- {
398
- "title": "G2",
399
- "subtopics": []
400
- }
401
-
402
- - If the image is blank or shows only partial/truncated snippet words (e.g. "Topics", "Content Amplification Additional guidance notes", "Subject content", "What students need to learn") and nothing else, return:
403
- {
404
- "title": "EMPTY_IMAGE",
405
- "subtopics": []
406
- }
407
- """
408
- global _GEMINI_CLIENT
409
- if _GEMINI_CLIENT is None:
410
- _GEMINI_CLIENT = genai.Client(api_key=api_key)
411
- client = _GEMINI_CLIENT
412
-
413
- resp = client.models.generate_content(
414
- model="gemini-2.0-flash",
415
- contents=[
416
- {
417
- "parts": [
418
- {"text": prompt},
419
- {
420
- "inline_data": {
421
- "mime_type": "image/jpeg",
422
- "data": base64.b64encode(image_data).decode("utf-8")
423
- }
424
- }
425
- ]
426
- }
427
- ],
428
- config=types.GenerateContentConfig(temperature=0.0)
429
- )
430
-
431
- if not resp or not resp.text:
432
- logger.warning("Gemini returned an empty response for subtopic extraction.")
433
- return {"title": "", "subtopics": []}
434
-
435
- raw = resp.text.strip()
436
- # Remove any markdown fences if present
437
- raw = raw.replace("```json", "").replace("```", "").strip()
438
- data = json.loads(raw)
439
-
440
- title = data.get("title", "")
441
- subtopics = data.get("subtopics", [])
442
- if title.upper() == "EMPTY_IMAGE":
443
- return {"title": "EMPTY_IMAGE", "subtopics": []}
444
- if not isinstance(subtopics, list):
445
- subtopics = []
446
- return {"title": title, "subtopics": subtopics}
447
-
448
- except Exception as e:
449
- logger.error(f"Gemini subtopic identification error on attempt {attempt}: {e}")
450
- if attempt < max_retries:
451
- time.sleep(0.5)
452
- else:
453
- return {"title": "", "subtopics": []}
454
-
455
- return {"title": "", "subtopics": []}
456
-
457
- class S3ImageWriter(DataWriter):
458
  def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
459
  self.s3_writer = s3_writer
460
  self.base_path = base_path if base_path.endswith("/") else base_path + "/"
461
  self.gemini_api_key = gemini_api_key
462
  self.descriptions = {}
463
- self._img_count = 0
464
- self.extracted_tables = {}
465
-
466
- self.extracted_subtopics = {}
467
 
468
  def write(self, path: str, data: bytes) -> None:
469
- self._img_count += 1
470
- unique_id = f"img_{self._img_count}.jpg"
471
- s3_key = f"{self.base_path}{unique_id}"
472
- self.s3_writer.write(s3_key, data)
473
  self.descriptions[path] = {
474
  "data": data,
475
- "s3_path": s3_key,
476
- "table_classification": "NO_TABLE",
477
- "final_alt": ""
478
- }
479
-
480
- async def post_process_async(self, key: str, md_content: str) -> str:
481
- logger.info("Classifying images to detect tables.")
482
- tasks = {
483
- p: asyncio.create_task(classify_image_async(info["data"], self.gemini_api_key))
484
- for p, info in self.descriptions.items()
485
  }
486
- results = await asyncio.gather(*tasks.values(), return_exceptions=True)
487
- for p, result in zip(list(self.descriptions.keys()), results):
488
- if isinstance(result, Exception):
489
- logger.error(f"Table classification error for {p}: {result}")
490
- self.descriptions[p]['table_classification'] = "NO_TABLE"
491
- else:
492
- self.descriptions[p]['table_classification'] = result
493
-
494
- # Process each image description.
495
- for p, info in list(self.descriptions.items()):
496
- cls = info['table_classification']
497
- if cls == "TWO_COLUMN":
498
- info['final_alt'] = "HAS TO BE PROCESSED - two column table"
499
- elif cls == "THREE_COLUMN":
500
- info['final_alt'] = "HAS TO BE PROCESSED - three column table"
501
- elif cls == "EMPTY_IMAGE":
502
- md_content = md_content.replace(f"![]({key}{p})", "")
503
- try:
504
- self.s3_writer.delete(info['s3_path'])
505
- except Exception as e:
506
- logger.error(f"Error deleting S3 object {info['s3_path']}: {e}")
507
- del self.descriptions[p]
508
- continue
509
- else:
510
- info['final_alt'] = "NO_TABLE image"
511
- md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['s3_path']})")
512
-
513
- md_content = await self._process_table_images_in_markdown(key, md_content)
514
-
515
- # Filter final lines to keep only lines with images.
516
- final_lines = [
517
- line.strip() for line in md_content.split("\n")
518
- if re.match(r"^\!\[.*\]\(.*\)", line.strip())
519
- ]
520
- return "\n".join(final_lines)
521
-
522
- async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
523
- pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
524
- matches = re.findall(pat, md_content, flags=re.IGNORECASE)
525
- if not matches:
526
- return md_content
527
-
528
- for (col_type, s3_key) in matches:
529
- logger.info(f"Processing table image: {s3_key}, columns={col_type}")
530
- img_data = None
531
- for desc in self.descriptions.values():
532
- if desc.get("s3_path") == s3_key:
533
- img_data = desc.get("data")
534
- break
535
- if img_data is None:
536
- logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
537
- continue
538
-
539
- # Write temporary file for processing.
540
- with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
541
- temp_file.write(img_data)
542
- temp_path = temp_file.name
543
-
544
- try:
545
- if col_type.lower() == 'two':
546
- extractor = TableExtractor(
547
- skip_header=True,
548
- merge_two_col_rows=True,
549
- enable_subtopic_merge=True,
550
- subtopic_threshold=0.2
551
- )
552
- else:
553
- extractor = TableExtractor(
554
- skip_header=True,
555
- merge_two_col_rows=False,
556
- enable_subtopic_merge=False,
557
- subtopic_threshold=0.2
558
- )
559
- row_boxes = extractor.process_image(temp_path)
560
- out_folder = temp_path + "_rows"
561
- os.makedirs(out_folder, exist_ok=True)
562
- extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
563
-
564
- #Group cells by row using file name pattern
565
- recognized_main_topic = ""
566
- main_topic_image_key = None
567
- recognized_subtopics = []
568
- header_found = False
569
- header_row_index = None
570
-
571
- # Loop through each row of extracted cells
572
- for i, row in enumerate(row_boxes):
573
- row_dir = os.path.join(out_folder, f"row_{i}")
574
- valid_info = None
575
- valid_cell_key = None
576
- for j in range(len(row)):
577
- cell_path = os.path.join(row_dir, f"col_{j}.png")
578
- if not os.path.isfile(cell_path):
579
- alternative_path = os.path.join(row_dir, f"col_{j}.jpg")
580
- if os.path.isfile(alternative_path):
581
- cell_path = alternative_path
582
- else:
583
- logger.warning(f"Cell image not found: {cell_path}")
584
- continue
585
- with open(cell_path, "rb") as cf:
586
- cell_image_data = cf.read()
587
- cell_key = f"{self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.png"
588
- self.s3_writer.write(cell_key, cell_image_data)
589
- info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)
590
- if info.get("title", "").upper() == "EMPTY_IMAGE":
591
- try:
592
- self.s3_writer.delete(cell_key)
593
- logger.info(f"Deleted empty cell image from S3: {cell_key}")
594
- except Exception as e:
595
- logger.error(f"Error deleting empty cell image {cell_key}: {e}")
596
- continue
597
- valid_info = info
598
- valid_cell_key = cell_key
599
- break # Use only the first valid cell in this row
600
-
601
- if valid_info is None:
602
- continue
603
-
604
- # First valid row becomes header row.
605
- if not header_found:
606
- header_found = True
607
- header_row_index = i
608
- recognized_main_topic = valid_info.get("title", "")
609
- main_topic_image_key = valid_cell_key
610
- # The row immediately following the header is used for subtopic children.
611
- elif i == header_row_index + 1:
612
- for st in valid_info.get("subtopics", []):
613
- recognized_subtopics.append({
614
- "title": st,
615
- "contents": [{"type": "image", "key": valid_cell_key}],
616
- "children": []
617
- })
618
- else:
619
- # Ignore further rows
620
- continue
621
-
622
- final_json = {
623
- "title": recognized_main_topic,
624
- "contents": [],
625
- "children": recognized_subtopics
626
- }
627
- if main_topic_image_key:
628
- final_json["contents"].append({"type": "image", "key": main_topic_image_key})
629
-
630
- # Save the final JSON.
631
- self.extracted_subtopics[s3_key] = final_json
632
-
633
- # Create a snippet to replace the markdown line.
634
- snippet = ["**Extracted table cells:**"]
635
- if main_topic_image_key:
636
- snippet.append(f"![Header]({main_topic_image_key})")
637
- for child in recognized_subtopics:
638
- for content in child.get("contents", []):
639
- snippet.append(f"![Child]({content.get('key')})")
640
- new_snip = "\n".join(snippet)
641
- old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
642
- md_content = md_content.replace(old_line, new_snip)
643
-
644
- except Exception as e:
645
- logger.error(f"Error processing table image {s3_key}: {e}")
646
- finally:
647
- os.remove(temp_path)
648
-
649
- return md_content
650
 
651
  def post_process(self, key: str, md_content: str) -> str:
652
- return asyncio.run(self.post_process_async(key, md_content))
653
-
654
- class GeminiTopicExtractor:
655
- def __init__(self, api_key: str = None, num_pages: int = 14):
656
- self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
657
- self.num_pages = num_pages
658
-
659
- def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
660
- first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
661
- if not first_pages_text.strip():
662
- logger.error("No text from first pages => cannot extract subtopics.")
663
- return {}
664
- prompt = f"""
665
- You have the first pages of a PDF specification, including a table of contents.
666
- Instructions:
667
- 1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
668
- 2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
669
- 3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
670
- 4. Output only valid JSON of the form:
671
- {{
672
- "Subtopic A": [start_page, end_page],
673
- "Subtopic B": [start_page, end_page]
674
- }}
675
- 5. If you can't find any subtopics, return an empty JSON.
676
- Important notes:
677
- - The correct "end_page" must be the page number of the next topic or subtopic minus 1.
678
- - The final output must be valid JSON only, with no extra text or code blocks.
679
- Examples:
680
- 1. Given this table of contents:
681
- 1 Introduction – 2
682
- Why choose Edexcel A Level Mathematics? - 2
683
- Supporting you in planning and implementing this qualification - 3
684
- Qualification at a glance - 5
685
- 2 Subject content and assessment information – 7
686
- Paper 1 and Paper 2: Pure Mathematics - 11
687
- Paper 3: Statistics and Mechanics - 30
688
- Assessment Objectives - 40
689
- 3 Administration and general information – 42
690
- Entries - 42
691
- Access arrangements, reasonable adjustments, special consideration and malpractice - 42
692
- Student recruitment and progression - 45
693
- Appendix 1: Formulae – 49
694
- Appendix 2: Notation – 53
695
- Appendix 3: Use of calculators – 59
696
- Appendix 4: Assessment Objectives – 60
697
- Appendix 5: The context for the development of this qualification – 62
698
- Appendix 6: Transferable skills – 64
699
- Appendix 7: Level 3 Extended Project qualification – 65
700
- Appendix 8: Codes – 67
701
- The correct output should be:
702
- {{
703
- "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
704
- "Paper 3: Statistics and Mechanics": [30, 42]
705
- }}
706
- 2. Given this table of contents:
707
- Qualification at a glance – 1
708
- Assessment Objectives and weightings - 4
709
- Knowledge, skills and understanding – 5
710
- Theme 1: Introduction to markets and market failure - 5
711
- Theme 2: The UK economy – performance and policies - 11
712
- Theme 3: Business behaviour and the labour market - 21
713
- Theme 4: A global perspective - 29
714
- Assessment – 39
715
- Assessment summary - 39
716
- Assessment objectives - 41
717
- Assessment overview - 42
718
- Breakdown of assessment objectives - 42
719
- Synoptic assessment - 43
720
- Discount code and performance tables - 43
721
- Access arrangements, reasonable adjustments and special consideration - 44
722
- Malpractice - 45
723
- Equality Act 2010 and Pearson equality policy - 45
724
- Synoptic assessment - 46
725
- Awarding and reporting - 47
726
- Other information – 49
727
- Student recruitment -49
728
- Prior learning and other requirements -49
729
- Progression - 49
730
- Appendix 1: Transferable skills – 53
731
- Appendix 2: Level 3 Extended Project qualification – 55
732
- Appendix 3: Quantitative skills – 59
733
- Appendix 4: Codes – 61
734
- Appendix 5: Index – 63
735
- The correct output should be:
736
- {{
737
- "Theme 1: Introduction to markets and market failure": [5, 10],
738
- "Theme 2: The UK economy – performance and policies": [11, 20],
739
- "Theme 3: Business behaviour and the labour market": [21, 28],
740
- "Theme 4: A global perspective": [29, 38]
741
- }}
742
- 3. You might also see sections like:
743
- 2.1 AS Unit 1 11
744
- 2.2 AS Unit 2 18
745
- 2.3 A2 Unit 3 24
746
- 2.4 A2 Unit 4 31
747
- In that scenario, your output might look like:
748
- {{
749
- "2.1 AS Unit 1": [11, 17],
750
- "2.2 AS Unit 2": [18, 23],
751
- "2.3 A2 Unit 3": [24, 30],
752
- "2.4 A2 Unit 4": [31, 35]
753
- }}
754
- or
755
- 2.1 AS units 6
756
- 2.2 AS units 23
757
- In that scenario, your output might look like:
758
- {{
759
- "2.1 AS Unit 1": [6, 2],
760
- "2.2 AS Unit 2": [23, 43]
761
- }}
762
-
763
- 4. Another example might list subtopics:
764
- 3.1 Overarching themes 11
765
- 3.2 A: Proof 12
766
- 3.3 B: Algebra and functions 13
767
- 3.4 C: Coordinate geometry in the ( x , y ) plane 14
768
- 3.5 D: Sequences and series 15
769
- 3.6 E: Trigonometry 16
770
- 3.7 F: Exponentials and logarithms 17
771
- 3.8 G: Differentiation 18
772
- 3.9 H: Integration 19
773
- 3.10 I: Numerical methods 20
774
- 3.11 J: Vectors 20
775
- 3.12 K: Statistical sampling 21
776
- 3.13 L: Data presentation and interpretation 21
777
- 3.14 M: Probability 22
778
- 3.15 N: Statistical distributions 23
779
- 3.16 O: Statistical hypothesis testing 23
780
- 3.17 P: Quantities and units in mechanics 24
781
- 3.18 Q: Kinematics 24
782
- 3.19 R: Forces and Newton’s laws 24
783
- 3.20 S: Moments 25
784
- 3.21 Use of data in statistics 26
785
- Here the correct output might look like:
786
- {{
787
- "A: Proof": [12, 12],
788
- "B: Algebra and functions": [13, 13],
789
- ...
790
- }}
791
- Now, extract topics from this text:
792
- {first_pages_text}
793
- """
794
- global _GEMINI_CLIENT
795
- if _GEMINI_CLIENT is None:
796
- _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
797
- client = _GEMINI_CLIENT
798
- try:
799
- response = client.models.generate_content(
800
- model="gemini-2.0-flash",
801
- contents=[prompt],
802
- config=types.GenerateContentConfig(temperature=0.0)
803
- )
804
- if not response or not response.text:
805
- logger.warning("No text from LLM => returning empty subtopics.")
806
- return {}
807
- raw_json = response.text.strip()
808
- cleaned = raw_json.replace("```json", "").replace("```", "")
809
- try:
810
- data = json.loads(cleaned)
811
- except Exception as json_err:
812
- logger.error(f"JSON parsing error: {json_err}")
813
- return {}
814
- final_dict = {}
815
- found_sub_dict = None
816
- for k, v in data.items():
817
- if isinstance(v, dict):
818
- found_sub_dict = v
819
- break
820
- if found_sub_dict is not None:
821
- for subk, rng in found_sub_dict.items():
822
- if isinstance(rng, list) and len(rng) == 2:
823
- final_dict[subk] = rng
824
- else:
825
- for subk, rng in data.items():
826
- if isinstance(rng, list) and len(rng) == 2:
827
- final_dict[subk] = rng
828
- return final_dict
829
- except Exception as e:
830
- logger.error(f"Gemini subtopic extraction error: {e}")
831
- return {}
832
-
833
- def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
834
- text_parts = []
835
- try:
836
- if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
837
- response = requests.get(pdf_path)
838
- if response.status_code != 200:
839
- logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
840
- return ""
841
- pdf_bytes = response.content
842
- else:
843
- with open(pdf_path, "rb") as f:
844
- pdf_bytes = f.read()
845
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
846
- pages_to_read = min(num_pages, doc.page_count)
847
- for i in range(pages_to_read):
848
- raw_text = doc[i].get_text("raw")
849
- text_parts.append(raw_text)
850
- doc.close()
851
- except Exception as e:
852
- logger.error(f"Could not open PDF: {e}")
853
- return "\n".join(text_parts)
854
 
855
- class MineruNoTextProcessor:
856
- def __init__(self, output_folder: str, gemini_api_key: str):
 
857
  self.output_folder = output_folder
858
  os.makedirs(self.output_folder, exist_ok=True)
859
  self.layout_model = "doclayout_yolo"
860
  self.formula_enable = True
861
  self.table_enable = False
862
  self.language = "en"
863
-
864
- self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
865
- self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
866
-
867
- self.use_s3 = True
868
  self.s3_writer = s3Writer(
869
  ak=os.getenv("S3_ACCESS_KEY"),
870
  sk=os.getenv("S3_SECRET_KEY"),
@@ -880,110 +125,106 @@ class MineruNoTextProcessor:
880
  except Exception as e:
881
  logger.error(f"Error during GPU cleanup: {e}")
882
 
883
- def process(self, pdf_path: str) -> Dict[str, Any]:
884
- logger.info(f"Processing PDF: {pdf_path}")
885
- try:
886
- subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
887
- logger.info(f"Gemini returned subtopics: {subtopics}")
888
-
889
- if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
890
- response = requests.get(pdf_path)
891
- if response.status_code != 200:
892
- logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
893
- raise Exception(f"Failed to download PDF: {pdf_path}")
894
- pdf_bytes = response.content
895
- logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
896
- else:
897
- with open(pdf_path, "rb") as f:
898
- pdf_bytes = f.read()
899
- logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
900
-
901
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
902
- total_pages = doc.page_count
903
- doc.close()
904
-
905
- # Decide which pages to process
906
- final_pages = set()
907
- if not subtopics:
908
- # fallback
909
- final_pages = set(range(total_pages))
910
- else:
911
- offset_candidates = []
912
- for subname, rng in subtopics.items():
913
- start_p, _ = rng
914
- occs = find_all_occurrences(pdf_bytes, subname)
915
- for p in occs:
916
- candidate = p - (start_p - 1)
917
- if candidate > 0:
918
- offset_candidates.append(candidate)
919
- if offset_candidates:
920
- try:
921
- from statistics import mode
922
- global_offset = mode(offset_candidates)
923
- except:
924
- from statistics import median
925
- global_offset = int(median(offset_candidates))
926
- else:
927
- global_offset = 0
928
-
929
- logger.info(f"Computed global offset: {global_offset}")
930
- for subname, rng in subtopics.items():
931
- if not (isinstance(rng, list) and len(rng) == 2):
932
- continue
933
- start_p, end_p = rng
934
- if start_p > end_p:
935
- continue
936
- s0 = (start_p - 1) + global_offset
937
- e0 = (end_p - 1) + global_offset
938
- for pp in range(s0, e0 + 1):
939
- final_pages.add(pp)
940
-
941
- if not final_pages:
942
- final_pages = set(range(total_pages))
943
-
944
- logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
945
- subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
946
-
947
- # 4) Analyze and produce markdown
948
- dataset = PymuDocDataset(subset_pdf_bytes)
949
- inference = doc_analyze(
950
- dataset,
951
- ocr=True,
952
- lang=self.language,
953
- layout_model=self.layout_model,
954
- formula_enable=self.formula_enable,
955
- table_enable=self.table_enable
956
- )
957
- # S3
958
- writer = S3ImageWriter(self.s3_writer, "/topic-extraction", self.gemini_api_key)
959
-
960
- md_prefix = "/topic-extraction/"
961
- pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
962
- md_content = pipe_result.get_markdown(md_prefix)
963
- final_markdown = writer.post_process(md_prefix, md_content)
964
-
965
- subtopic_list = list(writer.extracted_subtopics.values())
966
- subtopic_list = merge_topics(subtopic_list)
967
-
968
- out_path = os.path.join(self.output_folder, "_subtopics.json")
969
- with open(out_path, "w", encoding="utf-8") as f:
970
- json.dump(subtopic_list, f, indent=2)
971
- logger.info(f"Final subtopics JSON saved locally at {out_path}")
972
 
973
- return {
974
- "final_markdown": final_markdown,
975
- "subtopics_extracted": subtopic_list
976
- }
977
- finally:
978
- self.cleanup_gpu()
979
 
980
  if __name__ == "__main__":
981
- input_pdf = "/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf"
982
- output_dir = "/home/user/app/pearson_json"
983
- gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
984
- try:
985
- processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
986
- result = processor.process(input_pdf)
987
- logger.info("Processing completed successfully.")
988
- except Exception as e:
989
- logger.error(f"Processing failed: {e}")
 
1
  #!/usr/bin/env python3
2
  import os
3
+ import sys
 
4
  import json
5
  import logging
6
+ import gc
7
  import fitz
8
  import requests
9
  import torch
10
+ import boto3
11
+ import re
12
 
13
  from magic_pdf.data.dataset import PymuDocDataset
14
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 
 
15
 
16
  logging.basicConfig(level=logging.INFO)
17
  logger = logging.getLogger(__name__)
18
 
19
+ def create_subset_pdf(original_pdf_bytes: bytes, page_indices: list) -> bytes:
20
  if not page_indices:
21
  raise ValueError("No page indices provided for subset creation.")
22
  doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
 
32
  doc.close()
33
  return subset_bytes
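
The body of create_subset_pdf is unchanged and largely collapsed above; for reference, a minimal PyMuPDF sketch that satisfies the same contract (an assumption for illustration, not the committed implementation):

import fitz  # PyMuPDF

def subset_pdf_sketch(pdf_bytes: bytes, page_indices: list) -> bytes:
    src = fitz.open(stream=pdf_bytes, filetype="pdf")
    out = fitz.open()  # new empty document
    for idx in page_indices:
        out.insert_pdf(src, from_page=idx, to_page=idx)  # copy one page per index
    data = out.tobytes()
    out.close()
    src.close()
    return data
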
34
 
35
+ def parse_page_range(page_field) -> list:
36
  """
37
+ Parse the 'page' field from the JSON input.
38
+ It can be either:
39
+ a list of integers:
40
+ - If the list contains exactly two integers, treat them as a 1-indexed range [start, end], inclusive at both ends.
41
+ - Otherwise, treat the list as a sequence of individual pages.
42
+ a string:
43
+ - Either a comma-separated range "start, end" or a comma-separated list of pages.
44
+ The numbers are assumed to be 1-indexed and are converted to 0-indexed.
45
  """
46
+ if isinstance(page_field, list):
47
+ if len(page_field) == 2:
48
+ start, end = page_field
49
+ return list(range(start - 1, end))
50
  else:
51
+ return [int(p) - 1 for p in page_field]
52
+ elif isinstance(page_field, str):
53
+ parts = [p.strip() for p in page_field.split(',')]
54
+ if len(parts) == 2:
55
+ start, end = int(parts[0]), int(parts[1])
56
+ return list(range(start - 1, end))
57
+ else:
58
+ return [int(p) - 1 for p in parts]
59
+ else:
60
+ logger.error("Invalid type for page field. Must be list or string.")
61
+ raise ValueError("Invalid page field type.")
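
How the two accepted shapes come out of parse_page_range (calls shown with their results; values illustrative):

parse_page_range([15, 20])      # -> [14, 15, 16, 17, 18, 19]  (1-indexed range 15..20)
parse_page_range([15, 16, 20])  # -> [14, 15, 19]              (individual pages)
parse_page_range("15, 20")      # -> [14, 15, 16, 17, 18, 19]
parse_page_range("15, 16, 20")  # -> [14, 15, 19]
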
62
 
63
  class s3Writer:
64
  def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
 
72
 
73
  def write(self, path: str, data: bytes) -> None:
74
  try:
75
+ from io import BytesIO
76
  file_obj = BytesIO(data)
77
+ self.client.upload_fileobj(file_obj, self.bucket, path)
78
  logger.info(f"Uploaded to S3: {path}")
79
  except Exception as e:
80
  logger.error(f"Failed to upload to S3: {str(e)}")
81
  raise
82
 
83
+ class S3ImageWriter:
84
  def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
85
  self.s3_writer = s3_writer
86
  self.base_path = base_path if base_path.endswith("/") else base_path + "/"
87
  self.gemini_api_key = gemini_api_key
88
  self.descriptions = {}
89
 
90
  def write(self, path: str, data: bytes) -> None:
91
+ full_path = f"{self.base_path}{os.path.basename(path)}"
92
+ self.s3_writer.write(full_path, data)
 
 
93
  self.descriptions[path] = {
94
  "data": data,
95
+ "s3_path": full_path
96
  }
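
write() now keys every upload by basename under the writer's prefix; a sketch with hypothetical names:

import os
base_path = "/topic-extraction/"
path = "images/3_0.jpg"
full_path = f"{base_path}{os.path.basename(path)}"  # "/topic-extraction/3_0.jpg"
# Design note: unlike the removed counter-based writer, two images that share a
# basename would now overwrite each other in S3.
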
97
 
98
  def post_process(self, key: str, md_content: str) -> str:
99
+ for path, info in self.descriptions.items():
100
+ s3_path = info.get("s3_path")
101
+ md_content = md_content.replace(f"![]({key}{path})", f"![]({s3_path})")
102
+ return md_content
103
 
104
+ class TopicExtractionProcessor:
105
+ def __init__(self, gemini_api_key: str, s3_config: dict, output_folder: str):
106
+ self.gemini_api_key = gemini_api_key
107
  self.output_folder = output_folder
108
  os.makedirs(self.output_folder, exist_ok=True)
109
  self.layout_model = "doclayout_yolo"
110
  self.formula_enable = True
111
  self.table_enable = False
112
  self.language = "en"
113
  self.s3_writer = s3Writer(
114
  ak=os.getenv("S3_ACCESS_KEY"),
115
  sk=os.getenv("S3_SECRET_KEY"),
 
125
  except Exception as e:
126
  logger.error(f"Error during GPU cleanup: {e}")
127
 
128
+ def process_input_file(self, input_file: dict) -> str:
129
+ key = input_file.get("key", "")
130
+ url = input_file.get("url", "")
131
+ page_field = input_file.get("page")
132
+ if not url or not page_field:
133
+ logger.error("Input file must contain 'url' and 'page' fields.")
134
+ raise ValueError("Missing 'url' or 'page' in input file.")
135
+
136
+ page_indices = parse_page_range(page_field)
137
+ logger.info("Using page indices (0-indexed): %s", page_indices)
138
+
139
+ # Retrieve PDF bytes (supports URL or local file)
140
+ if url.startswith("http://") or url.startswith("https://"):
141
+ response = requests.get(url)
142
+ if response.status_code != 200:
143
+ logger.error("Failed to download PDF from %s. Status code: %d", url, response.status_code)
144
+ raise Exception(f"Failed to download PDF: {url}")
145
+ pdf_bytes = response.content
146
+ else:
147
+ with open(url, "rb") as f:
148
+ pdf_bytes = f.read()
149
+
150
+ subset_pdf_bytes = create_subset_pdf(pdf_bytes, page_indices)
151
+ logger.info("Created subset PDF with %d pages", len(page_indices))
152
+
153
+ dataset = PymuDocDataset(subset_pdf_bytes)
154
+ inference = doc_analyze(
155
+ dataset,
156
+ ocr=True,
157
+ lang=self.language,
158
+ layout_model=self.layout_model,
159
+ formula_enable=self.formula_enable,
160
+ table_enable=self.table_enable
161
+ )
162
+
163
+ base_path = f"/topic-extraction/{key}/"
164
+ writer = S3ImageWriter(self.s3_writer, "/topic-extraction/", self.gemini_api_key)
165
+ md_prefix = "/topic-extraction/"
166
+ pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
167
+ md_content = pipe_result.get_markdown(md_prefix)
168
+ final_markdown = writer.post_process(md_prefix, md_content)
169
+
170
+ output_md_path = os.path.join(self.output_folder, f"{key}_output.md")
171
+ with open(output_md_path, "w", encoding="utf-8") as f:
172
+ f.write(final_markdown)
173
+ logger.info("Markdown output saved to %s", output_md_path)
174
+
175
+ self.cleanup_gpu()
176
+ return final_markdown
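
process_input_file also accepts an http(s) URL for 'url'; a hypothetical invocation (key, URL, and pages made up):

input_file = {
    "key": "econ_spec",
    "url": "https://example.com/specs/economics.pdf",
    "page": "5, 38",  # comma-separated, 1-indexed inclusive range
}
final_md = processor.process_input_file(input_file)
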
177
+
178
+ def main():
179
+ message = {
180
+ "pattern": "topic_extraction",
181
+ "data": {
182
+ "input_files": [
183
+ {
184
+ "key": "sample_spec",
185
+ "url": "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf",
186
+ "type": "specification",
187
+ "page": [
188
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
189
+ 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41
190
+ ]
191
+ }
192
+ ],
193
+ "topics": [
194
+ {
195
+ "title": "Sample Topic",
196
+ "id": 123
197
+ }
198
+ ]
199
+ }
200
+ }
201
+ data = message.get("data", {})
202
+ input_files = data.get("input_files", [])
203
+
204
+ output_folder = "output"
205
+
206
+ gemini_api_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
207
 
208
+ s3_config = {
209
+ "ak": os.getenv("S3_ACCESS_KEY"),
210
+ "sk": os.getenv("S3_SECRET_KEY"),
211
+ "bucket": "quextro-resources",
212
+ "endpoint_url": os.getenv("S3_ENDPOINT")
213
+ }
214
+
215
+ processor = TopicExtractionProcessor(
216
+ gemini_api_key=gemini_api_key,
217
+ s3_config=s3_config,
218
+ output_folder=output_folder
219
+ )
220
+
221
+ for input_file in input_files:
222
+ try:
223
+ logger.info("Processing input file with key: %s", input_file.get("key", ""))
224
+ final_md = processor.process_input_file(input_file)
225
+ logger.info("Processing completed for key: %s", input_file.get("key", ""))
226
+ except Exception as e:
227
+ logger.error("Error processing input file: %s", e)
228
 
229
  if __name__ == "__main__":
230
+ main()