change the logic
Browse files- __pycache__/contents_extractor_v2.cpython-310.pyc +0 -0
- __pycache__/mineru_test_local.cpython-310.pyc +0 -0
- __pycache__/topic_extraction_upgrade.cpython-310.pyc +0 -0
- input_output/outpu/images/img_1.png +0 -0
- input_output/outpu/images/img_10.png +0 -0
- input_output/outpu/images/img_11.png +0 -0
- input_output/outpu/images/img_12.png +0 -0
- input_output/outpu/images/img_13.png +0 -0
- input_output/outpu/images/img_14.png +0 -0
- input_output/outpu/images/img_15.png +0 -0
- input_output/outpu/images/img_16.png +0 -0
- input_output/outpu/images/img_17.png +0 -0
- input_output/outpu/images/img_18.png +0 -0
- input_output/outpu/images/img_19.png +0 -0
- input_output/outpu/images/img_2.png +0 -0
- input_output/outpu/images/img_20.png +0 -0
- input_output/outpu/images/img_21.png +0 -0
- input_output/outpu/images/img_22.png +0 -0
- input_output/outpu/images/img_23.png +0 -0
- input_output/outpu/images/img_24.png +0 -0
- input_output/outpu/images/img_25.png +0 -0
- input_output/outpu/images/img_26.png +0 -0
- input_output/outpu/images/img_3.png +0 -0
- input_output/outpu/images/img_4.png +0 -0
- input_output/outpu/images/img_5.png +0 -0
- input_output/outpu/images/img_6.png +0 -0
- input_output/outpu/images/img_7.png +0 -0
- input_output/outpu/images/img_8.png +0 -0
- input_output/outpu/images/img_9.png +0 -0
- topic_extr.py +64 -156
__pycache__/contents_extractor_v2.cpython-310.pyc
DELETED
Binary file (7 kB)
|
|
__pycache__/mineru_test_local.cpython-310.pyc
DELETED
Binary file (11.9 kB)
|
|
__pycache__/topic_extraction_upgrade.cpython-310.pyc
DELETED
Binary file (10.9 kB)
|
|
input_output/outpu/images/img_1.png
ADDED
![]() |
input_output/outpu/images/img_10.png
ADDED
![]() |
input_output/outpu/images/img_11.png
ADDED
![]() |
input_output/outpu/images/img_12.png
ADDED
![]() |
input_output/outpu/images/img_13.png
ADDED
![]() |
input_output/outpu/images/img_14.png
ADDED
![]() |
input_output/outpu/images/img_15.png
ADDED
![]() |
input_output/outpu/images/img_16.png
ADDED
![]() |
input_output/outpu/images/img_17.png
ADDED
![]() |
input_output/outpu/images/img_18.png
ADDED
![]() |
input_output/outpu/images/img_19.png
ADDED
![]() |
input_output/outpu/images/img_2.png
ADDED
![]() |
input_output/outpu/images/img_20.png
ADDED
![]() |
input_output/outpu/images/img_21.png
ADDED
![]() |
input_output/outpu/images/img_22.png
ADDED
![]() |
input_output/outpu/images/img_23.png
ADDED
![]() |
input_output/outpu/images/img_24.png
ADDED
![]() |
input_output/outpu/images/img_25.png
ADDED
![]() |
input_output/outpu/images/img_26.png
ADDED
![]() |
input_output/outpu/images/img_3.png
ADDED
![]() |
input_output/outpu/images/img_4.png
ADDED
![]() |
input_output/outpu/images/img_5.png
ADDED
![]() |
input_output/outpu/images/img_6.png
ADDED
![]() |
input_output/outpu/images/img_7.png
ADDED
![]() |
input_output/outpu/images/img_8.png
ADDED
![]() |
input_output/outpu/images/img_9.png
ADDED
![]() |
topic_extr.py
CHANGED
@@ -10,34 +10,23 @@ import cv2
|
|
10 |
import numpy as np
|
11 |
from io import BytesIO
|
12 |
from typing import List, Dict, Any
|
|
|
13 |
|
14 |
import torch
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
from google.genai import types
|
20 |
-
except ImportError:
|
21 |
-
genai = None
|
22 |
-
types = None
|
23 |
|
24 |
-
# magic-pdf imports
|
25 |
from magic_pdf.data.dataset import PymuDocDataset
|
26 |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
27 |
|
28 |
-
# table extraction logic
|
29 |
from table_row_extraction import TableExtractor
|
30 |
|
31 |
-
###############################################################################
|
32 |
-
# Logging Setup
|
33 |
-
###############################################################################
|
34 |
logging.basicConfig(level=logging.INFO)
|
35 |
logger = logging.getLogger(__name__)
|
36 |
logger.setLevel(logging.INFO)
|
37 |
|
38 |
-
###############################################################################
|
39 |
-
# PDF Subset Creation
|
40 |
-
###############################################################################
|
41 |
def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
|
42 |
"""
|
43 |
Creates a new PDF (in memory) containing only the pages in page_indices (0-based).
|
@@ -59,20 +48,16 @@ def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> byt
|
|
59 |
doc.close()
|
60 |
return subset_bytes
|
61 |
|
62 |
-
###############################################################################
|
63 |
-
# Utility: Shrink Images Before Sending to Gemini
|
64 |
-
###############################################################################
|
65 |
def shrink_image_to_jpeg(image_data: bytes, max_dim: int = 800, jpeg_quality: int = 80) -> bytes:
|
66 |
"""
|
67 |
Decode image_data, resize so largest dimension <= max_dim, then re-encode as JPEG.
|
68 |
This reduces request size to Gemini significantly.
|
69 |
"""
|
70 |
try:
|
71 |
-
# Decode
|
72 |
arr = np.frombuffer(image_data, np.uint8)
|
73 |
img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
|
74 |
if img is None:
|
75 |
-
# Not a valid image, return as
|
76 |
return image_data
|
77 |
|
78 |
h, w, _ = img.shape
|
@@ -84,7 +69,6 @@ def shrink_image_to_jpeg(image_data: bytes, max_dim: int = 800, jpeg_quality: in
|
|
84 |
new_h = int(h * scale)
|
85 |
img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
|
86 |
|
87 |
-
# Re-encode
|
88 |
encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), jpeg_quality]
|
89 |
success, enc = cv2.imencode(".jpg", img, encode_params)
|
90 |
if success:
|
@@ -96,15 +80,12 @@ def shrink_image_to_jpeg(image_data: bytes, max_dim: int = 800, jpeg_quality: in
|
|
96 |
logger.warning(f"shrink_image_to_jpeg error: {e}. Returning original data.")
|
97 |
return image_data
|
98 |
|
99 |
-
###############################################################################
|
100 |
-
# Gemini LLM - Subtopic Extraction
|
101 |
-
###############################################################################
|
102 |
class GeminiTopicExtractor:
|
103 |
"""
|
104 |
Reads the first few pages of a PDF to get the table of contents text,
|
105 |
then uses Gemini to parse out topics -> [start_page, end_page].
|
106 |
"""
|
107 |
-
def __init__(self, api_key: str = None, num_pages: int =
|
108 |
self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
|
109 |
if not self.api_key:
|
110 |
logger.warning("No Gemini API key provided for subtopic extraction.")
|
@@ -117,100 +98,32 @@ class GeminiTopicExtractor:
|
|
117 |
return {}
|
118 |
|
119 |
if genai is None or types is None:
|
120 |
-
logger.warning("google.genai
|
121 |
return {}
|
122 |
|
123 |
prompt = f"""
|
124 |
You will be provided with the first pages of an exam board document.
|
125 |
-
Your goal is to extract the main subject-related topics from the "Contents" section
|
126 |
-
|
|
|
127 |
1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
|
128 |
2. Extract only the **highest-level, subject-related subtopics** (ignore administrative sections).
|
129 |
3. For each subtopic, return [start_page, end_page] (1-based).
|
130 |
4. Output valid JSON in the following format:
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
|
136 |
Important Notes:
|
137 |
-
- Ignore non-subject-related sections (e.g.,
|
|
|
138 |
- The extracted subtopics should represent major academic areas, not organizational or structural elements.
|
139 |
-
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
1 Introduction – 2
|
146 |
-
Why choose Edexcel A Level Mathematics? - 2
|
147 |
-
Supporting you in planning and implementing this qualification - 3
|
148 |
-
Qualification at a glance - 5
|
149 |
-
2 Subject content and assessment information – 7
|
150 |
-
Paper 1 and Paper 2: Pure Mathematics - 11
|
151 |
-
Paper 3: Statistics and Mechanics - 30
|
152 |
-
Assessment Objectives - 40
|
153 |
-
3 Administration and general information – 42
|
154 |
-
Entries - 42
|
155 |
-
Access arrangements, reasonable adjustments, special consideration and malpractice - 42
|
156 |
-
Student recruitment and progression - 45
|
157 |
-
Appendix 1: Formulae – 49
|
158 |
-
Appendix 2: Notation – 53
|
159 |
-
Appendix 3: Use of calculators – 59
|
160 |
-
Appendix 4: Assessment Objectives – 60
|
161 |
-
Appendix 5: The context for the development of this qualification – 62
|
162 |
-
Appendix 6: Transferable skills – 64
|
163 |
-
Appendix 7: Level 3 Extended Project qualification – 65
|
164 |
-
Appendix 8: Codes – 67
|
165 |
-
|
166 |
-
The correct output should be:
|
167 |
-
|
168 |
-
{{
|
169 |
-
"Paper 1 and Paper 2: Pure Mathematics": [11, 29],
|
170 |
-
"Paper 3: Statistics and Mechanics": [30, 42]
|
171 |
-
}}
|
172 |
-
|
173 |
-
2. Given this table of contents:
|
174 |
-
|
175 |
-
Qualification at a glance – 1
|
176 |
-
Assessment Objectives and weightings - 4
|
177 |
-
Knowledge, skills and understanding – 5
|
178 |
-
Theme 1: Introduction to markets and market failure - 5
|
179 |
-
Theme 2: The UK economy – performance and policies - 11
|
180 |
-
Theme 3: Business behaviour and the labour market - 21
|
181 |
-
Theme 4: A global perspective - 29
|
182 |
-
Assessment – 39
|
183 |
-
Assessment summary - 39
|
184 |
-
Assessment objectives - 41
|
185 |
-
Assessment overview - 42
|
186 |
-
Breakdown of assessment objectives - 42
|
187 |
-
Synoptic assessment - 43
|
188 |
-
Discount code and performance tables - 43
|
189 |
-
Access arrangements, reasonable adjustments and special consideration - 44
|
190 |
-
Malpractice - 45
|
191 |
-
Equality Act 2010 and Pearson equality policy - 45
|
192 |
-
Synoptic assessment - 46
|
193 |
-
Awarding and reporting - 47
|
194 |
-
Other information – 49
|
195 |
-
Student recruitment -49
|
196 |
-
Prior learning and other requirements -49
|
197 |
-
Progression - 49
|
198 |
-
Appendix 1: Transferable skills – 53
|
199 |
-
Appendix 2: Level 3 Extended Project qualification – 55
|
200 |
-
Appendix 3: Quantitative skills – 59
|
201 |
-
Appendix 4: Codes – 61
|
202 |
-
Appendix 5: Index – 63
|
203 |
-
|
204 |
-
The correct output should be:
|
205 |
-
|
206 |
-
{{
|
207 |
-
"Theme 1: Introduction to markets and market failure": [5, 10]
|
208 |
-
"Theme 2: The UK economy – performance and policies": - [11, 20]
|
209 |
-
"Theme 3: Business behaviour and the labour market": [21, 28]
|
210 |
-
"Theme 4: A global perspective": [29, 38]
|
211 |
-
}}
|
212 |
-
|
213 |
-
Now, extract topics from this text: {text_content}
|
214 |
"""
|
215 |
|
216 |
try:
|
@@ -246,9 +159,6 @@ Examples:
|
|
246 |
logger.error(f"Could not open/read PDF: {e}")
|
247 |
return "\n".join(text_parts)
|
248 |
|
249 |
-
###############################################################################
|
250 |
-
# Gemini-based Image Classification
|
251 |
-
###############################################################################
|
252 |
def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str:
|
253 |
"""
|
254 |
Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE using Gemini (Flash).
|
@@ -261,7 +171,6 @@ def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str
|
|
261 |
logger.warning("google.genai not installed, returning NO_TABLE.")
|
262 |
return "NO_TABLE"
|
263 |
|
264 |
-
# Shrink image
|
265 |
shrunk_data = shrink_image_to_jpeg(image_data, max_dim=800, jpeg_quality=80)
|
266 |
|
267 |
prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
|
@@ -285,14 +194,6 @@ THREE_COLUMN
|
|
285 |
NO_TABLE
|
286 |
"""
|
287 |
try:
|
288 |
-
# Example of optional manual timeout approach (commented out):
|
289 |
-
# import signal
|
290 |
-
# def handler(signum, frame):
|
291 |
-
# raise TimeoutError("Table classification timed out!")
|
292 |
-
# signal.signal(signal.SIGALRM, handler)
|
293 |
-
# signal.alarm(30) # 30s timeout
|
294 |
-
|
295 |
-
logger.debug("Sending image to Gemini for table classification...")
|
296 |
client = genai.Client(api_key=api_key)
|
297 |
response = client.models.generate_content(
|
298 |
model="gemini-2.0-flash",
|
@@ -311,8 +212,6 @@ NO_TABLE
|
|
311 |
],
|
312 |
config=types.GenerateContentConfig(temperature=0.0)
|
313 |
)
|
314 |
-
# signal.alarm(0) # cancel timeout
|
315 |
-
|
316 |
if response and response.text:
|
317 |
logger.info(f"[Gemini table classification] LLM raw response:\n{response.text}")
|
318 |
|
@@ -357,7 +256,6 @@ If the image is of a multiple-choice question’s options, then modify your answ
|
|
357 |
Otherwise, follow the above instructions strictly.
|
358 |
"""
|
359 |
try:
|
360 |
-
logger.debug("Sending image to Gemini for description...")
|
361 |
client = genai.Client(api_key=api_key)
|
362 |
response = client.models.generate_content(
|
363 |
model="gemini-2.0-flash",
|
@@ -384,14 +282,11 @@ Otherwise, follow the above instructions strictly.
|
|
384 |
logger.error(f"Gemini image description error: {e}")
|
385 |
return "Image description unavailable"
|
386 |
|
387 |
-
###############################################################################
|
388 |
-
# Local Image Writer (Sequential Gemini Calls)
|
389 |
-
###############################################################################
|
390 |
class LocalImageWriter:
|
391 |
"""
|
392 |
-
Saves extracted images,
|
393 |
-
|
394 |
-
|
395 |
into row/column cell images.
|
396 |
"""
|
397 |
def __init__(self, output_folder: str, gemini_api_key: str):
|
@@ -427,24 +322,46 @@ class LocalImageWriter:
|
|
427 |
|
428 |
def post_process(self, key: str, md_content: str) -> str:
|
429 |
"""
|
430 |
-
1)
|
431 |
-
2)
|
432 |
3) Replace placeholders in the Markdown with final alt text.
|
433 |
4) Process table images => row/col cell images => update Markdown.
|
434 |
5) Keep only image-reference lines in the final Markdown.
|
435 |
"""
|
436 |
-
# 1) Table classification
|
437 |
-
logger.info("Classifying images to detect tables (
|
438 |
-
|
439 |
-
|
440 |
-
self.descriptions
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
448 |
|
449 |
# For images classified as 2/3-column tables => set alt
|
450 |
for p, info in self.descriptions.items():
|
@@ -525,16 +442,13 @@ class LocalImageWriter:
|
|
525 |
|
526 |
return md_content
|
527 |
|
528 |
-
###############################################################################
|
529 |
-
# Mineru (magic-pdf) Pipeline with Page-Range Preprocessing
|
530 |
-
###############################################################################
|
531 |
class MineruNoTextProcessor:
|
532 |
"""
|
533 |
1) Extracts page ranges from the PDF's table of contents (via Gemini).
|
534 |
2) Creates a subset PDF in memory for those pages.
|
535 |
3) Runs magic-pdf analysis on the subset PDF.
|
536 |
4) Generates a Markdown file with images, including table images
|
537 |
-
split into row/column cells.
|
538 |
"""
|
539 |
def __init__(self, output_folder: str, gemini_api_key: str = None):
|
540 |
self.output_folder = output_folder
|
@@ -546,7 +460,7 @@ class MineruNoTextProcessor:
|
|
546 |
self.table_enable = False
|
547 |
self.language = "en"
|
548 |
|
549 |
-
self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=
|
550 |
self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
|
551 |
|
552 |
def cleanup_gpu(self):
|
@@ -611,7 +525,7 @@ class MineruNoTextProcessor:
|
|
611 |
pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
|
612 |
md_content = pipe_result.get_markdown("local-unique-prefix/")
|
613 |
|
614 |
-
# 7) Post-process =>
|
615 |
final_markdown = image_writer.post_process("local-unique-prefix/", md_content)
|
616 |
|
617 |
# 8) Save final Markdown
|
@@ -642,17 +556,11 @@ class MineruNoTextProcessor:
|
|
642 |
logger.warning(f"Skipping topic '{topic}' with invalid range: {rng}")
|
643 |
return pages
|
644 |
|
645 |
-
###############################################################################
|
646 |
-
# Main Execution
|
647 |
-
###############################################################################
|
648 |
if __name__ == "__main__":
|
649 |
-
# Example usage:
|
650 |
input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
|
651 |
-
output_dir = "/home/user/app/input_output/
|
652 |
|
653 |
-
# Provide your Gemini API key (or rely on GEMINI_API_KEY env var).
|
654 |
gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
|
655 |
-
# gemini_key = "YOUR_GEMINI_API_KEY"
|
656 |
|
657 |
try:
|
658 |
processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
|
|
|
10 |
import numpy as np
|
11 |
from io import BytesIO
|
12 |
from typing import List, Dict, Any
|
13 |
+
import concurrent.futures
|
14 |
|
15 |
import torch
|
16 |
|
17 |
+
from google import genai
|
18 |
+
from google.genai import types
|
19 |
+
|
|
|
|
|
|
|
|
|
20 |
|
|
|
21 |
from magic_pdf.data.dataset import PymuDocDataset
|
22 |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
23 |
|
|
|
24 |
from table_row_extraction import TableExtractor
|
25 |
|
|
|
|
|
|
|
26 |
logging.basicConfig(level=logging.INFO)
|
27 |
logger = logging.getLogger(__name__)
|
28 |
logger.setLevel(logging.INFO)
|
29 |
|
|
|
|
|
|
|
30 |
def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
|
31 |
"""
|
32 |
Creates a new PDF (in memory) containing only the pages in page_indices (0-based).
|
|
|
48 |
doc.close()
|
49 |
return subset_bytes
|
50 |
|
|
|
|
|
|
|
51 |
def shrink_image_to_jpeg(image_data: bytes, max_dim: int = 800, jpeg_quality: int = 80) -> bytes:
|
52 |
"""
|
53 |
Decode image_data, resize so largest dimension <= max_dim, then re-encode as JPEG.
|
54 |
This reduces request size to Gemini significantly.
|
55 |
"""
|
56 |
try:
|
|
|
57 |
arr = np.frombuffer(image_data, np.uint8)
|
58 |
img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
|
59 |
if img is None:
|
60 |
+
# Not a valid image, return as-is
|
61 |
return image_data
|
62 |
|
63 |
h, w, _ = img.shape
|
|
|
69 |
new_h = int(h * scale)
|
70 |
img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
|
71 |
|
|
|
72 |
encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), jpeg_quality]
|
73 |
success, enc = cv2.imencode(".jpg", img, encode_params)
|
74 |
if success:
|
|
|
80 |
logger.warning(f"shrink_image_to_jpeg error: {e}. Returning original data.")
|
81 |
return image_data
|
82 |
|
|
|
|
|
|
|
83 |
class GeminiTopicExtractor:
|
84 |
"""
|
85 |
Reads the first few pages of a PDF to get the table of contents text,
|
86 |
then uses Gemini to parse out topics -> [start_page, end_page].
|
87 |
"""
|
88 |
+
def __init__(self, api_key: str = None, num_pages: int = 15):
|
89 |
self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
|
90 |
if not self.api_key:
|
91 |
logger.warning("No Gemini API key provided for subtopic extraction.")
|
|
|
98 |
return {}
|
99 |
|
100 |
if genai is None or types is None:
|
101 |
+
logger.warning("google.genai not installed. Returning empty subtopics.")
|
102 |
return {}
|
103 |
|
104 |
prompt = f"""
|
105 |
You will be provided with the first pages of an exam board document.
|
106 |
+
Your goal is to extract the main subject-related topics from the \"Contents\" section
|
107 |
+
and structure them in a valid JSON format.
|
108 |
+
Instructions:
|
109 |
1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
|
110 |
2. Extract only the **highest-level, subject-related subtopics** (ignore administrative sections).
|
111 |
3. For each subtopic, return [start_page, end_page] (1-based).
|
112 |
4. Output valid JSON in the following format:
|
113 |
+
{{
|
114 |
+
"Topic A": [start_page, end_page],
|
115 |
+
"Topic B": [start_page, end_page]
|
116 |
+
}}
|
117 |
|
118 |
Important Notes:
|
119 |
+
- Ignore non-subject-related sections (e.g., 'Introduction', 'Exam Guidelines', 'Appendices',
|
120 |
+
'Assessment, Qualification at a glance').
|
121 |
- The extracted subtopics should represent major academic areas, not organizational or structural elements.
|
122 |
+
- Ignore including the main topic page as start, ONLY subtopic first page.
|
123 |
+
- Make sure that all of the pages for a subtopic are included; the end page should be (the start page of the
|
124 |
+
next topic) - 1.
|
125 |
+
|
126 |
+
Now, extract topics from this text: {text_content}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
"""
|
128 |
|
129 |
try:
|
|
|
159 |
logger.error(f"Could not open/read PDF: {e}")
|
160 |
return "\n".join(text_parts)
|
161 |
|
|
|
|
|
|
|
162 |
def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str:
|
163 |
"""
|
164 |
Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE using Gemini (Flash).
|
|
|
171 |
logger.warning("google.genai not installed, returning NO_TABLE.")
|
172 |
return "NO_TABLE"
|
173 |
|
|
|
174 |
shrunk_data = shrink_image_to_jpeg(image_data, max_dim=800, jpeg_quality=80)
|
175 |
|
176 |
prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
|
|
|
194 |
NO_TABLE
|
195 |
"""
|
196 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
client = genai.Client(api_key=api_key)
|
198 |
response = client.models.generate_content(
|
199 |
model="gemini-2.0-flash",
|
|
|
212 |
],
|
213 |
config=types.GenerateContentConfig(temperature=0.0)
|
214 |
)
|
|
|
|
|
215 |
if response and response.text:
|
216 |
logger.info(f"[Gemini table classification] LLM raw response:\n{response.text}")
|
217 |
|
|
|
256 |
Otherwise, follow the above instructions strictly.
|
257 |
"""
|
258 |
try:
|
|
|
259 |
client = genai.Client(api_key=api_key)
|
260 |
response = client.models.generate_content(
|
261 |
model="gemini-2.0-flash",
|
|
|
282 |
logger.error(f"Gemini image description error: {e}")
|
283 |
return "Image description unavailable"
|
284 |
|
|
|
|
|
|
|
285 |
class LocalImageWriter:
|
286 |
"""
|
287 |
+
Saves extracted images, then does concurrent Gemini classification
|
288 |
+
and description calls. Finally modifies the Markdown to replace
|
289 |
+
references with final alt text. Also processes table images
|
290 |
into row/column cell images.
|
291 |
"""
|
292 |
def __init__(self, output_folder: str, gemini_api_key: str):
|
|
|
322 |
|
323 |
def post_process(self, key: str, md_content: str) -> str:
|
324 |
"""
|
325 |
+
1) Table classification calls (concurrent).
|
326 |
+
2) Image description calls for non-table images (concurrent).
|
327 |
3) Replace placeholders in the Markdown with final alt text.
|
328 |
4) Process table images => row/col cell images => update Markdown.
|
329 |
5) Keep only image-reference lines in the final Markdown.
|
330 |
"""
|
331 |
+
# 1) Table classification (CONCURRENT)
|
332 |
+
logger.info("Classifying images to detect tables (concurrent)...")
|
333 |
+
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
|
334 |
+
future_map = {}
|
335 |
+
for p, info in self.descriptions.items():
|
336 |
+
fut = executor.submit(call_gemini_for_table_classification, info["data"], self.gemini_api_key)
|
337 |
+
future_map[fut] = p
|
338 |
+
|
339 |
+
for fut in concurrent.futures.as_completed(future_map):
|
340 |
+
path = future_map[fut]
|
341 |
+
try:
|
342 |
+
classification = fut.result()
|
343 |
+
self.descriptions[path]['table_classification'] = classification
|
344 |
+
except Exception as e:
|
345 |
+
logger.error(f"Error classifying table for image {path}: {e}")
|
346 |
+
self.descriptions[path]['table_classification'] = "NO_TABLE"
|
347 |
+
|
348 |
+
# 2) Image description (CONCURRENT), only for NO_TABLE images
|
349 |
+
logger.info("Generating image descriptions for non-table images (concurrent)...")
|
350 |
+
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
|
351 |
+
future_map_desc = {}
|
352 |
+
for p, info in self.descriptions.items():
|
353 |
+
if info['table_classification'] == "NO_TABLE":
|
354 |
+
fut = executor.submit(call_gemini_for_image_description, info["data"], self.gemini_api_key)
|
355 |
+
future_map_desc[fut] = p
|
356 |
+
|
357 |
+
for fut in concurrent.futures.as_completed(future_map_desc):
|
358 |
+
path = future_map_desc[fut]
|
359 |
+
try:
|
360 |
+
desc = fut.result()
|
361 |
+
self.descriptions[path]['final_alt'] = desc
|
362 |
+
except Exception as e:
|
363 |
+
logger.error(f"Error describing image {path}: {e}")
|
364 |
+
self.descriptions[path]['final_alt'] = "Image description unavailable"
|
365 |
|
366 |
# For images classified as 2/3-column tables => set alt
|
367 |
for p, info in self.descriptions.items():
|
|
|
442 |
|
443 |
return md_content
|
444 |
|
|
|
|
|
|
|
445 |
class MineruNoTextProcessor:
|
446 |
"""
|
447 |
1) Extracts page ranges from the PDF's table of contents (via Gemini).
|
448 |
2) Creates a subset PDF in memory for those pages.
|
449 |
3) Runs magic-pdf analysis on the subset PDF.
|
450 |
4) Generates a Markdown file with images, including table images
|
451 |
+
split into row/column cells, with concurrency for Gemini calls.
|
452 |
"""
|
453 |
def __init__(self, output_folder: str, gemini_api_key: str = None):
|
454 |
self.output_folder = output_folder
|
|
|
460 |
self.table_enable = False
|
461 |
self.language = "en"
|
462 |
|
463 |
+
self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=15)
|
464 |
self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
|
465 |
|
466 |
def cleanup_gpu(self):
|
|
|
525 |
pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
|
526 |
md_content = pipe_result.get_markdown("local-unique-prefix/")
|
527 |
|
528 |
+
# 7) Post-process => concurrent table classification / description => final MD
|
529 |
final_markdown = image_writer.post_process("local-unique-prefix/", md_content)
|
530 |
|
531 |
# 8) Save final Markdown
|
|
|
556 |
logger.warning(f"Skipping topic '{topic}' with invalid range: {rng}")
|
557 |
return pages
|
558 |
|
|
|
|
|
|
|
559 |
if __name__ == "__main__":
|
|
|
560 |
input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
|
561 |
+
output_dir = "/home/user/app/input_output/outpu"
|
562 |
|
|
|
563 |
gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
|
|
|
564 |
|
565 |
try:
|
566 |
processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
|