correct page range handling
- __pycache__/inference_svm_model.cpython-310.pyc +0 -0
- __pycache__/mineru_single.cpython-310.pyc +0 -0
- __pycache__/table_row_extraction.cpython-310.pyc +0 -0
- __pycache__/worker.cpython-310.pyc +0 -0
- output/images/img_1.png +0 -0
- output/images/img_10.png +0 -0
- output/images/img_11.png +0 -0
- output/images/img_12.png +0 -0
- output/images/img_13.png +0 -0
- output/images/img_14.png +0 -0
- output/images/img_15.png +0 -0
- output/images/img_16.png +0 -0
- output/images/img_17.png +0 -0
- output/images/img_18.png +0 -0
- output/images/img_19.png +0 -0
- output/images/img_2.png +0 -0
- output/images/img_20.png +0 -0
- output/images/img_21.png +0 -0
- output/images/img_22.png +0 -0
- output/images/img_23.png +0 -0
- output/images/img_24.png +0 -0
- output/images/img_25.png +0 -0
- output/images/img_26.png +0 -0
- output/images/img_27.png +0 -0
- output/images/img_28.png +0 -0
- output/images/img_3.png +0 -0
- output/images/img_4.png +0 -0
- output/images/img_5.png +0 -0
- output/images/img_6.png +0 -0
- output/images/img_7.png +0 -0
- output/images/img_8.png +0 -0
- output/images/img_9.png +0 -0
- topic_extr.py +350 -309
__pycache__/inference_svm_model.cpython-310.pyc
CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ

__pycache__/mineru_single.cpython-310.pyc
CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ

__pycache__/table_row_extraction.cpython-310.pyc
CHANGED
Binary files a/__pycache__/table_row_extraction.cpython-310.pyc and b/__pycache__/table_row_extraction.cpython-310.pyc differ

__pycache__/worker.cpython-310.pyc
CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
output/images/img_1.png … output/images/img_28.png (28 binary image files)
ADDED
topic_extr.py
CHANGED
Removed from the previous version of topic_extr.py (309 lines deleted):

- the standalone shrink_image_to_jpeg(...) helper, which downscaled an image with OpenCV (cv2.resize with INTER_AREA, JPEG re-encode) and returned the original bytes if encoding failed; the new version inlines this shrinking inside the table-classification call;
- call_gemini_for_image_description(...), which sent the shrunken image to gemini-2.0-flash and asked for a short description identifying the question number and part (with an "MCQ: A [option] B [option] C [option] D [option]" suffix for multiple-choice options), falling back to "Image description unavailable";
- the concurrent image-description pass in LocalImageWriter.post_process, which called that function for every image classified as NO_TABLE and used the result as the image's alt text;
- MineruNoTextProcessor._collect_page_indices(...), which flattened {"Topic": [start, end]} ranges (e.g. {"Topic A": [11, 29], "Topic B": [30, 42]} => [11..29, 30..42]) and raised ValueError on an invalid range;
- the old process() flow, which raised ValueError when no topics or page indices could be extracted from the table of contents, rather than falling back to processing the whole PDF.

The updated file follows.
# … (lines 1-5 of the file are unchanged and not shown in this diff view) …
import logging
import fitz
import base64
import concurrent.futures
from io import BytesIO
from typing import List, Dict, Any

import torch
import cv2
import numpy as np

# Attempt top-level import of google.genai
try:
    from google import genai
    from google.genai import types
except ImportError:
    genai = None
    types = None

# magic-pdf imports
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

# table extraction logic
from table_row_extraction import TableExtractor

###############################################################################
# Logging Setup
###############################################################################
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

###############################################################################
# PDF Utility Functions
###############################################################################
def unify_whitespace(text: str) -> str:
    """
    Replace runs of whitespace with a single space, strip leading/trailing, then lowercase.
    """
    return re.sub(r"\s+", " ", text).strip().lower()

def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    """
    Creates a new PDF (in memory) containing only pages in page_indices (0-based).
    Raises ValueError if page_indices is empty or out of range.
    """
    if not page_indices:
        raise ValueError("No page indices provided for subset creation.")

    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
    new_doc = fitz.open()
    for p in sorted(set(page_indices)):
        if 0 <= p < doc.page_count:
            new_doc.insert_pdf(doc, from_page=p, to_page=p)
        else:
            logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
            raise ValueError(f"Page index {p} out of range.")
    subset_bytes = new_doc.tobytes()
    new_doc.close()
    doc.close()
    return subset_bytes

###############################################################################
# Searching in PDF
###############################################################################
def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
    """
    Return a sorted list of 0-based pages in which `search_text` (normalized) appears,
    scanning the entire PDF in RAW mode.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    st_norm = unify_whitespace(search_text)
    found = []
    for i in range(doc.page_count):
        raw = doc[i].get_text("raw")
        norm = unify_whitespace(raw)
        if st_norm in norm:
            found.append(i)
    doc.close()
    return sorted(found)
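A minimal sketch of how these helpers compose, with a hypothetical file path (find_all_occurrences matches on whitespace-normalized, lowercased text, so a heading can be pasted straight from the table of contents):

    # Sketch only: locate a heading and cut a small subset PDF around it.
    with open("spec.pdf", "rb") as fh:      # illustrative path
        pdf_bytes = fh.read()

    hits = find_all_occurrences(pdf_bytes, "Paper 1 and Paper 2: Pure Mathematics")
    if hits:
        first = hits[0]                     # first 0-based page containing the heading
        # create_subset_pdf raises ValueError if the range runs past the last page
        subset = create_subset_pdf(pdf_bytes, list(range(first, first + 5)))
        with open("subset.pdf", "wb") as out:
            out.write(subset)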
###############################################################################
# Gemini LLM for Subtopic Extraction
###############################################################################
class GeminiTopicExtractor:
    """
    Extract subtopics from the PDF by reading the first `num_pages` pages, calling Gemini.
    We expect a structure like:
    {
      "2 Subject content and assessment information": {
          "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
          "Paper 3: Statistics and Mechanics": [30, 42]
      }
    }
    or sometimes just a flat dict:
    {
      "Paper 1 and Paper 2: Pure Mathematics": [15, 33],
      "Paper 3: Statistics and Mechanics": [34, 46]
    }
    We'll parse both forms.
    """
    def __init__(self, api_key: str = None, num_pages: int = 10):
        self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
        if not self.api_key:
            logger.warning("No Gemini API key for subtopic extraction.")
        self.num_pages = num_pages

    def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
        """
        Return a dict of subtopics => [start_page, end_page].
        Could be empty if parsing fails or the LLM can't find subtopics.
        """
        first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
        if not first_pages_text.strip():
            logger.error("No text from first pages => cannot extract subtopics.")
            return {}

        if genai is None or types is None:
            # … unchanged line not shown in this diff view (warns that google.genai is unavailable) …
            return {}

        prompt = f"""
        You have the first pages of a PDF specification, including a table of contents.

        Instructions:
        1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
        2. Extract subtopic names -> [start_page, end_page], in valid JSON format only.
        3. If you can't find any subtopics, return an empty JSON.

        Examples:

        1. Given this table of contents:

        1 Introduction – 2
        Why choose Edexcel A Level Mathematics? - 2
        Supporting you in planning and implementing this qualification - 3
        Qualification at a glance - 5
        2 Subject content and assessment information – 7
        Paper 1 and Paper 2: Pure Mathematics - 11
        Paper 3: Statistics and Mechanics - 30
        Assessment Objectives - 40
        3 Administration and general information – 42
        Entries - 42
        Access arrangements, reasonable adjustments, special consideration and malpractice - 42
        Student recruitment and progression - 45
        Appendix 1: Formulae – 49
        Appendix 2: Notation – 53
        Appendix 3: Use of calculators – 59
        Appendix 4: Assessment Objectives – 60
        Appendix 5: The context for the development of this qualification – 62
        Appendix 6: Transferable skills – 64
        Appendix 7: Level 3 Extended Project qualification – 65
        Appendix 8: Codes – 67

        The correct output should be:

        {{
            "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
            "Paper 3: Statistics and Mechanics": [30, 42]
        }}

        2. Given this table of contents:

        Qualification at a glance – 1
        Assessment Objectives and weightings - 4
        Knowledge, skills and understanding – 5
        Theme 1: Introduction to markets and market failure - 5
        Theme 2: The UK economy – performance and policies - 11
        Theme 3: Business behaviour and the labour market - 21
        Theme 4: A global perspective - 29
        Assessment – 39
        Assessment summary - 39
        Assessment objectives - 41
        Assessment overview - 42
        Breakdown of assessment objectives - 42
        Synoptic assessment - 43
        Discount code and performance tables - 43
        Access arrangements, reasonable adjustments and special consideration - 44
        Malpractice - 45
        Equality Act 2010 and Pearson equality policy - 45
        Synoptic assessment - 46
        Awarding and reporting - 47
        Other information – 49
        Student recruitment - 49
        Prior learning and other requirements - 49
        Progression - 49
        Appendix 1: Transferable skills – 53
        Appendix 2: Level 3 Extended Project qualification – 55
        Appendix 3: Quantitative skills – 59
        Appendix 4: Codes – 61
        Appendix 5: Index – 63

        The correct output should be:

        {{
            "Theme 1: Introduction to markets and market failure": [5, 10],
            "Theme 2: The UK economy – performance and policies": [11, 20],
            "Theme 3: Business behaviour and the labour market": [21, 28],
            "Theme 4: A global perspective": [29, 38]
        }}

        Now, extract topics from this text:
        {first_pages_text}
        """
        try:
            client = genai.Client(api_key=self.api_key)
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt],
                config=types.GenerateContentConfig(temperature=0.0)
            )
            if not response or not response.text:
                logger.warning("No text from LLM => returning empty subtopics.")
                return {}

            raw_json = response.text.strip()
            # Clean up triple backticks
            cleaned = raw_json.replace("```json", "").replace("```", "")

            # Attempt to parse
            data = json.loads(cleaned)
            # data might be nested or flat
            # if nested, e.g. {"2 Subject content": {"Paper 1...": [11,29]}}
            # if flat, e.g. {"Paper 1...": [11,29]}
            # We'll unify it to a single dict of subname => [start,end].
            final_dict = {}

            # If the top-level is a dict of dicts,
            # we look for a dict whose values are themselves subtopics,
            # or it might be a direct subtopic dict. Quick approach:
            #  - if any top-level value is a dict with numeric arrays, use that
            #  - else assume data is the direct subtopic dict
            found_sub_dict = None
            for k, v in data.items():
                if isinstance(v, dict):
                    # might be the sub-sub dict
                    found_sub_dict = v
                    break

            if found_sub_dict is not None:
                # parse found_sub_dict
                for subk, rng in found_sub_dict.items():
                    if isinstance(rng, list) and len(rng) == 2:
                        final_dict[subk] = rng
            else:
                # maybe data is the direct subtopic dict
                for subk, rng in data.items():
                    if isinstance(rng, list) and len(rng) == 2:
                        final_dict[subk] = rng

            return final_dict
        except Exception as e:
            logger.error(f"Gemini subtopic extraction error: {e}")
            return {}

    def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
        text_parts = []
        try:
            doc = fitz.open(pdf_path)
            pages_to_read = min(num_pages, doc.page_count)
            for i in range(pages_to_read):
                raw_text = doc[i].get_text("raw")
                text_parts.append(raw_text)
            doc.close()
        except Exception as e:
            logger.error(f"Could not open PDF: {e}")
        return "\n".join(text_parts)
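For reference, these are the two response shapes the parser accepts (values taken from the docstring example); both flatten to the same subtopic-to-range dict, because the first dict-valued entry wins when one exists:

    # Nested: a top-level section wrapping the subtopics.
    nested = {
        "2 Subject content and assessment information": {
            "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
            "Paper 3: Statistics and Mechanics": [30, 42],
        }
    }

    # Flat: the subtopics sit directly at the top level.
    flat = {
        "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
        "Paper 3: Statistics and Mechanics": [30, 42],
    }

    # extract_subtopics() yields {"Paper 1 and Paper 2: Pure Mathematics": [11, 29],
    #                             "Paper 3: Statistics and Mechanics": [30, 42]} in both cases.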
###############################################################################
# Concurrency for Table Classification
###############################################################################
def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str:
    """
    Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE using Gemini.
    """
    if not api_key:
        logger.warning("No Gemini API key => NO_TABLE.")
        return "NO_TABLE"
    if genai is None or types is None:
        logger.warning("google.genai not installed => NO_TABLE.")
        return "NO_TABLE"

    # Attempt to shrink
    try:
        arr = np.frombuffer(image_data, np.uint8)
        img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
        if img is not None:
            h, w, _ = img.shape
            max_dim = 800
            scale = 1.0
            if max(h, w) > max_dim:
                scale = max_dim / float(max(h, w))
            if scale < 1.0:
                new_w = int(w * scale)
                new_h = int(h * scale)
                img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
            encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), 70]
            success, enc = cv2.imencode(".jpg", img, encode_params)
            if success:
                image_data = enc.tobytes()
    except Exception as e:
        logger.warning(f"shrink_image_to_jpeg error: {e}")

    prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
Return only one label:
TWO_COLUMN
THREE_COLUMN
NO_TABLE
"""
    try:
        client = genai.Client(api_key=api_key)
        resp = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[
                {
                    # … unchanged lines not shown in this diff view (the "parts" list opens here,
                    #     presumably with a {"text": prompt} entry) …
                    {
                        "inline_data": {
                            "mime_type": "image/jpeg",
                            "data": base64.b64encode(image_data).decode('utf-8')
                        }
                    }
                    ]
                # … unchanged closing brace not shown …
            ],
            config=types.GenerateContentConfig(temperature=0.0)
        )
        if resp and resp.text:
            classification = resp.text.strip().upper()
            if "THREE" in classification:
                return "THREE_COLUMN"
            elif "TWO" in classification:
                return "TWO_COLUMN"
        return "NO_TABLE"
    except Exception as e:
        logger.error(f"Gemini table classification error: {e}")
        return "NO_TABLE"
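A short sketch of calling the classifier directly on one of the images added in this commit (assumes a valid GEMINI_API_KEY in the environment; without one the function simply returns "NO_TABLE"):

    # Sketch only: classify a single extracted image.
    with open("output/images/img_1.png", "rb") as fh:
        png_bytes = fh.read()

    label = call_gemini_for_table_classification(png_bytes, os.getenv("GEMINI_API_KEY", ""))
    print(label)  # "TWO_COLUMN", "THREE_COLUMN", or "NO_TABLE"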
###############################################################################
# LocalImageWriter
###############################################################################
class LocalImageWriter:
    """
    Writes extracted images, then does concurrency-based table classification calls.
    """
    def __init__(self, output_folder: str, gemini_api_key: str):
        self.output_folder = output_folder
        # … unchanged lines not shown in this diff view (images_dir setup, descriptions dict, _img_count, …) …
        self.gemini_api_key = gemini_api_key

    def write(self, path: str, data: bytes) -> None:
        self._img_count += 1
        fname = f"img_{self._img_count}.png"
        fpath = os.path.join(self.images_dir, fname)
        with open(fpath, "wb") as f:
            f.write(data)
        rel_path = os.path.relpath(fpath, self.output_folder)
        self.descriptions[path] = {
            "data": data,
            "relative_path": rel_path,
            "table_classification": "NO_TABLE",
            "final_alt": ""
        }

    def post_process(self, key: str, md_content: str) -> str:
        logger.info("Classifying images to detect tables (concurrent)...")
        with concurrent.futures.ThreadPoolExecutor(max_workers=6) as exe:
            fut_map = {}
            for p, info in self.descriptions.items():
                fut = exe.submit(call_gemini_for_table_classification, info["data"], self.gemini_api_key)
                fut_map[fut] = p

            for fut in concurrent.futures.as_completed(fut_map):
                path = fut_map[fut]
                try:
                    classification = fut.result()
                    self.descriptions[path]['table_classification'] = classification
                except Exception as e:
                    logger.error(f"Table classification error: {e}")
                    self.descriptions[path]['table_classification'] = "NO_TABLE"

        # 2) Set final alt text
        for p, info in self.descriptions.items():
            cls = info['table_classification']
            if cls == "TWO_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - two column table"
            elif cls == "THREE_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
            else:
                info['final_alt'] = "NO_TABLE image"

        # 3) Replace placeholders in the Markdown
        for p, info in self.descriptions.items():
            # … unchanged line not shown in this diff view (old_md, the placeholder reference emitted by magic-pdf) …
            new_md = f"![{info['final_alt']}]({info['relative_path']})"
            md_content = md_content.replace(old_md, new_md)

        # 4) If any table images => extract rows
        md_content = self._process_table_images_in_markdown(md_content)

        # 5) Keep only lines that are image references
        final_lines = []
        for line in md_content.split("\n"):
            if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
                # … unchanged lines not shown in this diff view (the matching line is kept in final_lines) …
        return "\n".join(final_lines)

    def _process_table_images_in_markdown(self, md_content: str) -> str:
        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content

        # … unchanged lines not shown in this diff view (iterate over matches, resolve abs_image_path,
        #     open a try block and construct the TableExtractor with its remaining arguments) …
                enable_subtopic_merge=False,
                subtopic_threshold=0.2
            )
            row_boxes = extractor.process_image(abs_image_path)
            out_folder = abs_image_path + "_rows"
            os.makedirs(out_folder, exist_ok=True)
            extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)

            snippet = ["**Extracted table cells:**"]
            for i, row in enumerate(row_boxes):
                row_dir = os.path.join(out_folder, f"row_{i}")
                for j, _ in enumerate(row):
                    cell_file = f"col_{j}.png"
                    cell_path = os.path.join(row_dir, cell_file)
                    relp = os.path.relpath(cell_path, self.output_folder)
                    snippet.append(f"")  # markdown image reference for the cell; the literal was stripped by the page rendering

            new_snip = "\n".join(snippet)
            old_line = f""  # the matched table-image reference; the literal was stripped by the page rendering
            md_content = md_content.replace(old_line, new_snip)
        except Exception as e:
            logger.error(f"Error processing table image {image_path}: {e}")

        return md_content
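To make step 4 concrete: a reference such as

    ![HAS TO BE PROCESSED - two column table](images/img_5.png)

is replaced, roughly, by one reference per extracted cell under the image's _rows folder (illustrative paths following the row_/col_ naming above; the per-cell alt text literal is not recoverable from this page view):

    **Extracted table cells:**
    ![](images/img_5.png_rows/row_0/col_0.png)
    ![](images/img_5.png_rows/row_0/col_1.png)
    ![](images/img_5.png_rows/row_1/col_0.png)
    ![](images/img_5.png_rows/row_1/col_1.png)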
###############################################################################
# MineruNoTextProcessor
###############################################################################
class MineruNoTextProcessor:
    """
    1) Use Gemini to get subtopics => e.g. {"Paper 1 and Paper 2: Pure Mathematics": [11,29], ...}
    2) For each subtopic name => find real occurrence in PDF at or after (start_page-1).
    3) offset = occurrence_page - (start_page-1). clamp offset >= 0
    4) Flatten final pages, subset PDF, run magic-pdf => concurrency => final MD
    5) If no subtopics found, process entire PDF as fallback.
    """
    def __init__(self, output_folder: str, gemini_api_key: str = None):
        self.output_folder = output_folder
        # … unchanged lines not shown in this diff view …
        self.layout_model = "doclayout_yolo"
        self.formula_enable = True
        self.table_enable = False
        self.language = "en"

        # Use our new flexible approach
        self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=10)
        self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")

    def cleanup_gpu(self):
        # … unchanged lines not shown in this diff view …

    def process(self, pdf_path: str) -> str:
        logger.info(f"Processing PDF: {pdf_path}")
        try:
            # 1) Extract subtopics from Gemini
            subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
            logger.info(f"Gemini returned subtopics: {subtopics}")

            # 2) Read entire PDF
            with open(pdf_path, "rb") as f:
                pdf_bytes = f.read()
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            total_pages = doc.page_count
            doc.close()

            final_pages = set()
            if not subtopics:
                logger.warning("No subtopics found. We'll process the entire PDF as fallback.")
                final_pages = set(range(total_pages))
            else:
                # For each subtopic, find occurrence >= (start_p-1)
                for subname, rng in subtopics.items():
                    if not (isinstance(rng, list) and len(rng) == 2):
                        logger.warning(f"Skipping subtopic '{subname}' => invalid range {rng}")
                        continue
                    start_p, end_p = rng
                    if start_p > end_p:
                        logger.warning(f"Skipping subtopic '{subname}' => start> end {rng}")
                        continue

                    # find occurrences
                    occs = find_all_occurrences(pdf_bytes, subname)
                    logger.info(f"Occurrences of subtopic '{subname}': {occs}")

                    doc_start_0 = start_p - 1
                    chosen_page = None
                    for p in occs:
                        if p >= doc_start_0:
                            chosen_page = p
                            break
                    if chosen_page is None:
                        # fallback to last or 0
                        if occs:
                            chosen_page = occs[-1]
                            logger.warning(f"No occurrence >= {doc_start_0} for '{subname}'. Using last => {chosen_page}")
                        else:
                            chosen_page = 0
                            logger.warning(f"No occurrences for '{subname}'. Using page 0.")

                    raw_offset = chosen_page - doc_start_0
                    offset = max(0, raw_offset)
                    logger.info(f"Subtopic '{subname}': doc_start={start_p}, chosen_page={chosen_page}, raw_offset={raw_offset}, offset={offset}")

                    s0 = (start_p - 1) + offset
                    e0 = (end_p - 1) + offset
                    s0 = max(0, min(total_pages - 1, s0))
                    e0 = max(0, min(total_pages - 1, e0))
                    for pp in range(s0, e0 + 1):
                        final_pages.add(pp)

            # 3) If final_pages is empty => fallback entire PDF
            if not final_pages:
                logger.warning("No valid pages after offset. We'll process entire PDF.")
                final_pages = set(range(total_pages))

            logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
            subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))

            # 4) doc_analyze => concurrency => final MD
            dataset = PymuDocDataset(subset_pdf_bytes)
            inference = doc_analyze(
                dataset,
                # … unchanged arguments not shown in this diff view …
            )
            logger.info("doc_analyze complete. Extracting images...")

            writer = LocalImageWriter(self.output_folder, self.gemini_api_key)
            pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
            md_content = pipe_result.get_markdown("local-unique-prefix/")

            final_markdown = writer.post_process("local-unique-prefix/", md_content)

            # 5) Save
            out_path = os.path.join(self.output_folder, "final_output.md")
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(final_markdown)

            logger.info(f"Markdown saved to: {out_path}")
            return final_markdown

        finally:
            self.cleanup_gpu()
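A worked example of the offset correction that gives this commit its name, with hypothetical numbers mirroring the logic in process(). The table of contents reports printed page numbers, which typically drift from 0-based PDF indices once covers and front matter are counted, so each range is shifted to where its heading actually occurs:

    # Hypothetical numbers only.
    start_p, end_p = 11, 29      # range reported by the table of contents (printed, 1-based)
    chosen_page = 14             # first 0-based PDF page where the heading text is actually found
    doc_start_0 = start_p - 1    # 10
    offset = max(0, chosen_page - doc_start_0)   # 4
    s0 = (start_p - 1) + offset                  # 14
    e0 = (end_p - 1) + offset                    # 32 -> pages 14..32 (0-based) are added to final_pages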
###############################################################################
# Example Main
###############################################################################
if __name__ == "__main__":
    input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
    output_dir = "/home/user/app/output"
    gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")

    try:
        processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
        md_output = processor.process(input_pdf)
        print("Final Markdown Output:")
        print(md_output)
    except Exception as e:
        logger.error(f"Processing failed: {e}")