SkyNait committed on
Commit
b7d667b
·
1 Parent(s): b6c51c5

change the logic

Browse files
__pycache__/contents_extractor_v2.cpython-310.pyc DELETED
Binary file (7 kB)
 
__pycache__/mineru_test_local.cpython-310.pyc DELETED
Binary file (11.9 kB)
 
__pycache__/topic_extraction_upgrade.cpython-310.pyc DELETED
Binary file (10.9 kB)
 
input_output/outpu/images/img_1.png ADDED
input_output/outpu/images/img_10.png ADDED
input_output/outpu/images/img_11.png ADDED
input_output/outpu/images/img_12.png ADDED
input_output/outpu/images/img_13.png ADDED
input_output/outpu/images/img_14.png ADDED
input_output/outpu/images/img_15.png ADDED
input_output/outpu/images/img_16.png ADDED
input_output/outpu/images/img_17.png ADDED
input_output/outpu/images/img_18.png ADDED
input_output/outpu/images/img_19.png ADDED
input_output/outpu/images/img_2.png ADDED
input_output/outpu/images/img_20.png ADDED
input_output/outpu/images/img_21.png ADDED
input_output/outpu/images/img_22.png ADDED
input_output/outpu/images/img_23.png ADDED
input_output/outpu/images/img_24.png ADDED
input_output/outpu/images/img_25.png ADDED
input_output/outpu/images/img_26.png ADDED
input_output/outpu/images/img_3.png ADDED
input_output/outpu/images/img_4.png ADDED
input_output/outpu/images/img_5.png ADDED
input_output/outpu/images/img_6.png ADDED
input_output/outpu/images/img_7.png ADDED
input_output/outpu/images/img_8.png ADDED
input_output/outpu/images/img_9.png ADDED
topic_extr.py CHANGED
@@ -10,34 +10,23 @@ import cv2
10
  import numpy as np
11
  from io import BytesIO
12
  from typing import List, Dict, Any
 
13
 
14
  import torch
15
 
16
- # Try to import google.genai
17
- try:
18
- from google import genai
19
- from google.genai import types
20
- except ImportError:
21
- genai = None
22
- types = None
23
 
24
- # magic-pdf imports
25
  from magic_pdf.data.dataset import PymuDocDataset
26
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
27
 
28
- # table extraction logic
29
  from table_row_extraction import TableExtractor
30
 
31
- ###############################################################################
32
- # Logging Setup
33
- ###############################################################################
34
  logging.basicConfig(level=logging.INFO)
35
  logger = logging.getLogger(__name__)
36
  logger.setLevel(logging.INFO)
37
 
38
- ###############################################################################
39
- # PDF Subset Creation
40
- ###############################################################################
41
  def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
42
  """
43
  Creates a new PDF (in memory) containing only the pages in page_indices (0-based).
@@ -59,20 +48,16 @@ def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> byt
59
  doc.close()
60
  return subset_bytes
61
 
62
- ###############################################################################
63
- # Utility: Shrink Images Before Sending to Gemini
64
- ###############################################################################
65
  def shrink_image_to_jpeg(image_data: bytes, max_dim: int = 800, jpeg_quality: int = 80) -> bytes:
66
  """
67
  Decode image_data, resize so largest dimension <= max_dim, then re-encode as JPEG.
68
  This reduces request size to Gemini significantly.
69
  """
70
  try:
71
- # Decode
72
  arr = np.frombuffer(image_data, np.uint8)
73
  img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
74
  if img is None:
75
- # Not a valid image, return as is
76
  return image_data
77
 
78
  h, w, _ = img.shape
@@ -84,7 +69,6 @@ def shrink_image_to_jpeg(image_data: bytes, max_dim: int = 800, jpeg_quality: in
84
  new_h = int(h * scale)
85
  img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
86
 
87
- # Re-encode
88
  encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), jpeg_quality]
89
  success, enc = cv2.imencode(".jpg", img, encode_params)
90
  if success:
@@ -96,15 +80,12 @@ def shrink_image_to_jpeg(image_data: bytes, max_dim: int = 800, jpeg_quality: in
96
  logger.warning(f"shrink_image_to_jpeg error: {e}. Returning original data.")
97
  return image_data
98
 
99
- ###############################################################################
100
- # Gemini LLM - Subtopic Extraction
101
- ###############################################################################
102
  class GeminiTopicExtractor:
103
  """
104
  Reads the first few pages of a PDF to get the table of contents text,
105
  then uses Gemini to parse out topics -> [start_page, end_page].
106
  """
107
- def __init__(self, api_key: str = None, num_pages: int = 14):
108
  self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
109
  if not self.api_key:
110
  logger.warning("No Gemini API key provided for subtopic extraction.")
@@ -117,100 +98,32 @@ class GeminiTopicExtractor:
117
  return {}
118
 
119
  if genai is None or types is None:
120
- logger.warning("google.genai is not installed. Returning empty subtopics.")
121
  return {}
122
 
123
  prompt = f"""
124
  You will be provided with the first pages of an exam board document.
125
- Your goal is to extract the main subject-related topics from the "Contents" section and structure them in a valid JSON format.Instructions:
126
- Instructions:
 
127
  1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
128
  2. Extract only the **highest-level, subject-related subtopics** (ignore administrative sections).
129
  3. For each subtopic, return [start_page, end_page] (1-based).
130
  4. Output valid JSON in the following format:
131
- {{
132
- "Topic A": [start_page, end_page],
133
- "Topic B": [start_page, end_page]
134
- }}
135
 
136
  Important Notes:
137
- - Ignore non-subject-related sections (e.g., "Introduction", "Exam Guidelines", "Appendices", "Assessment, Qualification at a glance").
 
138
  - The extracted subtopics should represent major academic areas, not organizational or structural elements.
139
- - Make sure that all of the pages for a subtopic are included, end page should be the -1 start page of the topic
140
- that comes next after the extracted one in contents section.
141
-
142
- Examples:
143
- 1. Given this table of contents:
144
-
145
- 1 Introduction – 2
146
- Why choose Edexcel A Level Mathematics? - 2
147
- Supporting you in planning and implementing this qualification - 3
148
- Qualification at a glance - 5
149
- 2 Subject content and assessment information – 7
150
- Paper 1 and Paper 2: Pure Mathematics - 11
151
- Paper 3: Statistics and Mechanics - 30
152
- Assessment Objectives - 40
153
- 3 Administration and general information – 42
154
- Entries - 42
155
- Access arrangements, reasonable adjustments, special consideration and malpractice - 42
156
- Student recruitment and progression - 45
157
- Appendix 1: Formulae – 49
158
- Appendix 2: Notation – 53
159
- Appendix 3: Use of calculators – 59
160
- Appendix 4: Assessment Objectives – 60
161
- Appendix 5: The context for the development of this qualification – 62
162
- Appendix 6: Transferable skills – 64
163
- Appendix 7: Level 3 Extended Project qualification – 65
164
- Appendix 8: Codes – 67
165
-
166
- The correct output should be:
167
-
168
- {{
169
- "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
170
- "Paper 3: Statistics and Mechanics": [30, 42]
171
- }}
172
-
173
- 2. Given this table of contents:
174
-
175
- Qualification at a glance – 1
176
- Assessment Objectives and weightings - 4
177
- Knowledge, skills and understanding – 5
178
- Theme 1: Introduction to markets and market failure - 5
179
- Theme 2: The UK economy – performance and policies - 11
180
- Theme 3: Business behaviour and the labour market - 21
181
- Theme 4: A global perspective - 29
182
- Assessment – 39
183
- Assessment summary - 39
184
- Assessment objectives - 41
185
- Assessment overview - 42
186
- Breakdown of assessment objectives - 42
187
- Synoptic assessment - 43
188
- Discount code and performance tables - 43
189
- Access arrangements, reasonable adjustments and special consideration - 44
190
- Malpractice - 45
191
- Equality Act 2010 and Pearson equality policy - 45
192
- Synoptic assessment - 46
193
- Awarding and reporting - 47
194
- Other information – 49
195
- Student recruitment -49
196
- Prior learning and other requirements -49
197
- Progression - 49
198
- Appendix 1: Transferable skills – 53
199
- Appendix 2: Level 3 Extended Project qualification – 55
200
- Appendix 3: Quantitative skills – 59
201
- Appendix 4: Codes – 61
202
- Appendix 5: Index – 63
203
-
204
- The correct output should be:
205
-
206
- {{
207
- "Theme 1: Introduction to markets and market failure": [5, 10]
208
- "Theme 2: The UK economy – performance and policies": - [11, 20]
209
- "Theme 3: Business behaviour and the labour market": [21, 28]
210
- "Theme 4: A global perspective": [29, 38]
211
- }}
212
-
213
- Now, extract topics from this text: {text_content}
214
  """
215
 
216
  try:
@@ -246,9 +159,6 @@ Examples:
246
  logger.error(f"Could not open/read PDF: {e}")
247
  return "\n".join(text_parts)
248
 
249
- ###############################################################################
250
- # Gemini-based Image Classification
251
- ###############################################################################
252
  def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str:
253
  """
254
  Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE using Gemini (Flash).
@@ -261,7 +171,6 @@ def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str
261
  logger.warning("google.genai not installed, returning NO_TABLE.")
262
  return "NO_TABLE"
263
 
264
- # Shrink image
265
  shrunk_data = shrink_image_to_jpeg(image_data, max_dim=800, jpeg_quality=80)
266
 
267
  prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
@@ -285,14 +194,6 @@ THREE_COLUMN
285
  NO_TABLE
286
  """
287
  try:
288
- # Example of optional manual timeout approach (commented out):
289
- # import signal
290
- # def handler(signum, frame):
291
- # raise TimeoutError("Table classification timed out!")
292
- # signal.signal(signal.SIGALRM, handler)
293
- # signal.alarm(30) # 30s timeout
294
-
295
- logger.debug("Sending image to Gemini for table classification...")
296
  client = genai.Client(api_key=api_key)
297
  response = client.models.generate_content(
298
  model="gemini-2.0-flash",
@@ -311,8 +212,6 @@ NO_TABLE
311
  ],
312
  config=types.GenerateContentConfig(temperature=0.0)
313
  )
314
- # signal.alarm(0) # cancel timeout
315
-
316
  if response and response.text:
317
  logger.info(f"[Gemini table classification] LLM raw response:\n{response.text}")
318
 
@@ -357,7 +256,6 @@ If the image is of a multiple-choice question’s options, then modify your answ
357
  Otherwise, follow the above instructions strictly.
358
  """
359
  try:
360
- logger.debug("Sending image to Gemini for description...")
361
  client = genai.Client(api_key=api_key)
362
  response = client.models.generate_content(
363
  model="gemini-2.0-flash",
@@ -384,14 +282,11 @@ Otherwise, follow the above instructions strictly.
384
  logger.error(f"Gemini image description error: {e}")
385
  return "Image description unavailable"
386
 
387
- ###############################################################################
388
- # Local Image Writer (Sequential Gemini Calls)
389
- ###############################################################################
390
  class LocalImageWriter:
391
  """
392
- Saves extracted images, classifies them with Gemini for table/no-table,
393
- describes them if no-table, then modifies the Markdown to replace
394
- the original references with final alt text. Also processes table images
395
  into row/column cell images.
396
  """
397
  def __init__(self, output_folder: str, gemini_api_key: str):
@@ -427,24 +322,46 @@ class LocalImageWriter:
427
 
428
  def post_process(self, key: str, md_content: str) -> str:
429
  """
430
- 1) Classify images as table/no-table (sequential).
431
- 2) Describe non-table images (sequential).
432
  3) Replace placeholders in the Markdown with final alt text.
433
  4) Process table images => row/col cell images => update Markdown.
434
  5) Keep only image-reference lines in the final Markdown.
435
  """
436
- # 1) Table classification
437
- logger.info("Classifying images to detect tables (sequential)...")
438
- for p, info in self.descriptions.items():
439
- classification = call_gemini_for_table_classification(info["data"], self.gemini_api_key)
440
- self.descriptions[p]['table_classification'] = classification
441
-
442
- # 2) Image description for non-table
443
- logger.info("Generating image descriptions for non-table images (sequential)...")
444
- for p, info in self.descriptions.items():
445
- if info['table_classification'] == "NO_TABLE":
446
- desc = call_gemini_for_image_description(info["data"], self.gemini_api_key)
447
- info['final_alt'] = desc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
 
449
  # For images classified as 2/3-column tables => set alt
450
  for p, info in self.descriptions.items():
@@ -525,16 +442,13 @@ class LocalImageWriter:
525
 
526
  return md_content
527
 
528
- ###############################################################################
529
- # Mineru (magic-pdf) Pipeline with Page-Range Preprocessing
530
- ###############################################################################
531
  class MineruNoTextProcessor:
532
  """
533
  1) Extracts page ranges from the PDF's table of contents (via Gemini).
534
  2) Creates a subset PDF in memory for those pages.
535
  3) Runs magic-pdf analysis on the subset PDF.
536
  4) Generates a Markdown file with images, including table images
537
- split into row/column cells.
538
  """
539
  def __init__(self, output_folder: str, gemini_api_key: str = None):
540
  self.output_folder = output_folder
@@ -546,7 +460,7 @@ class MineruNoTextProcessor:
546
  self.table_enable = False
547
  self.language = "en"
548
 
549
- self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=4)
550
  self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
551
 
552
  def cleanup_gpu(self):
@@ -611,7 +525,7 @@ class MineruNoTextProcessor:
611
  pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
612
  md_content = pipe_result.get_markdown("local-unique-prefix/")
613
 
614
- # 7) Post-process => classify table images => final MD
615
  final_markdown = image_writer.post_process("local-unique-prefix/", md_content)
616
 
617
  # 8) Save final Markdown
@@ -642,17 +556,11 @@ class MineruNoTextProcessor:
642
  logger.warning(f"Skipping topic '{topic}' with invalid range: {rng}")
643
  return pages
644
 
645
- ###############################################################################
646
- # Main Execution
647
- ###############################################################################
648
  if __name__ == "__main__":
649
- # Example usage:
650
  input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
651
- output_dir = "/home/user/app/input_output/output"
652
 
653
- # Provide your Gemini API key (or rely on GEMINI_API_KEY env var).
654
  gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
655
- # gemini_key = "YOUR_GEMINI_API_KEY"
656
 
657
  try:
658
  processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
 
10
  import numpy as np
11
  from io import BytesIO
12
  from typing import List, Dict, Any
13
+ import concurrent.futures
14
 
15
  import torch
16
 
17
+ from google import genai
18
+ from google.genai import types
19
+
 
 
 
 
20
 
 
21
  from magic_pdf.data.dataset import PymuDocDataset
22
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
23
 
 
24
  from table_row_extraction import TableExtractor
25
 
 
 
 
26
  logging.basicConfig(level=logging.INFO)
27
  logger = logging.getLogger(__name__)
28
  logger.setLevel(logging.INFO)
29
 
 
 
 
30
  def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
31
  """
32
  Creates a new PDF (in memory) containing only the pages in page_indices (0-based).
 
48
  doc.close()
49
  return subset_bytes
50
 
 
 
 
51
  def shrink_image_to_jpeg(image_data: bytes, max_dim: int = 800, jpeg_quality: int = 80) -> bytes:
52
  """
53
  Decode image_data, resize so largest dimension <= max_dim, then re-encode as JPEG.
54
  This reduces request size to Gemini significantly.
55
  """
56
  try:
 
57
  arr = np.frombuffer(image_data, np.uint8)
58
  img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
59
  if img is None:
60
+ # Not a valid image, return as-is
61
  return image_data
62
 
63
  h, w, _ = img.shape
 
69
  new_h = int(h * scale)
70
  img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
71
 
 
72
  encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), jpeg_quality]
73
  success, enc = cv2.imencode(".jpg", img, encode_params)
74
  if success:
 
80
  logger.warning(f"shrink_image_to_jpeg error: {e}. Returning original data.")
81
  return image_data
82
 
 
 
 
83
  class GeminiTopicExtractor:
84
  """
85
  Reads the first few pages of a PDF to get the table of contents text,
86
  then uses Gemini to parse out topics -> [start_page, end_page].
87
  """
88
+ def __init__(self, api_key: str = None, num_pages: int = 15):
89
  self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
90
  if not self.api_key:
91
  logger.warning("No Gemini API key provided for subtopic extraction.")
 
98
  return {}
99
 
100
  if genai is None or types is None:
101
+ logger.warning("google.genai not installed. Returning empty subtopics.")
102
  return {}
103
 
104
  prompt = f"""
105
  You will be provided with the first pages of an exam board document.
106
+ Your goal is to extract the main subject-related topics from the \"Contents\" section
107
+ and structure them in a valid JSON format.
108
+ Instructions:
109
  1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
110
  2. Extract only the **highest-level, subject-related subtopics** (ignore administrative sections).
111
  3. For each subtopic, return [start_page, end_page] (1-based).
112
  4. Output valid JSON in the following format:
113
+ {{
114
+ "Topic A": [start_page, end_page],
115
+ "Topic B": [start_page, end_page]
116
+ }}
117
 
118
  Important Notes:
119
+ - Ignore non-subject-related sections (e.g., 'Introduction', 'Exam Guidelines', 'Appendices',
120
+ 'Assessment, Qualification at a glance').
121
  - The extracted subtopics should represent major academic areas, not organizational or structural elements.
122
+ - Ignore including the main topic page as start, ONLY subtopic first page.
123
+ - Make sure that all of the pages for a subtopic are included; the end page should be (the start page of the
124
+ next topic) - 1.
125
+
126
+ Now, extract topics from this text: {text_content}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  """
128
 
129
  try:
 
159
  logger.error(f"Could not open/read PDF: {e}")
160
  return "\n".join(text_parts)
161
 
 
 
 
162
  def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str:
163
  """
164
  Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE using Gemini (Flash).
 
171
  logger.warning("google.genai not installed, returning NO_TABLE.")
172
  return "NO_TABLE"
173
 
 
174
  shrunk_data = shrink_image_to_jpeg(image_data, max_dim=800, jpeg_quality=80)
175
 
176
  prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
 
194
  NO_TABLE
195
  """
196
  try:
 
 
 
 
 
 
 
 
197
  client = genai.Client(api_key=api_key)
198
  response = client.models.generate_content(
199
  model="gemini-2.0-flash",
 
212
  ],
213
  config=types.GenerateContentConfig(temperature=0.0)
214
  )
 
 
215
  if response and response.text:
216
  logger.info(f"[Gemini table classification] LLM raw response:\n{response.text}")
217
 
 
256
  Otherwise, follow the above instructions strictly.
257
  """
258
  try:
 
259
  client = genai.Client(api_key=api_key)
260
  response = client.models.generate_content(
261
  model="gemini-2.0-flash",
 
282
  logger.error(f"Gemini image description error: {e}")
283
  return "Image description unavailable"
284
 
 
 
 
285
  class LocalImageWriter:
286
  """
287
+ Saves extracted images, then does concurrent Gemini classification
288
+ and description calls. Finally modifies the Markdown to replace
289
+ references with final alt text. Also processes table images
290
  into row/column cell images.
291
  """
292
  def __init__(self, output_folder: str, gemini_api_key: str):
 
322
 
323
  def post_process(self, key: str, md_content: str) -> str:
324
  """
325
+ 1) Table classification calls (concurrent).
326
+ 2) Image description calls for non-table images (concurrent).
327
  3) Replace placeholders in the Markdown with final alt text.
328
  4) Process table images => row/col cell images => update Markdown.
329
  5) Keep only image-reference lines in the final Markdown.
330
  """
331
+ # 1) Table classification (CONCURRENT)
332
+ logger.info("Classifying images to detect tables (concurrent)...")
333
+ with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
334
+ future_map = {}
335
+ for p, info in self.descriptions.items():
336
+ fut = executor.submit(call_gemini_for_table_classification, info["data"], self.gemini_api_key)
337
+ future_map[fut] = p
338
+
339
+ for fut in concurrent.futures.as_completed(future_map):
340
+ path = future_map[fut]
341
+ try:
342
+ classification = fut.result()
343
+ self.descriptions[path]['table_classification'] = classification
344
+ except Exception as e:
345
+ logger.error(f"Error classifying table for image {path}: {e}")
346
+ self.descriptions[path]['table_classification'] = "NO_TABLE"
347
+
348
+ # 2) Image description (CONCURRENT), only for NO_TABLE images
349
+ logger.info("Generating image descriptions for non-table images (concurrent)...")
350
+ with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
351
+ future_map_desc = {}
352
+ for p, info in self.descriptions.items():
353
+ if info['table_classification'] == "NO_TABLE":
354
+ fut = executor.submit(call_gemini_for_image_description, info["data"], self.gemini_api_key)
355
+ future_map_desc[fut] = p
356
+
357
+ for fut in concurrent.futures.as_completed(future_map_desc):
358
+ path = future_map_desc[fut]
359
+ try:
360
+ desc = fut.result()
361
+ self.descriptions[path]['final_alt'] = desc
362
+ except Exception as e:
363
+ logger.error(f"Error describing image {path}: {e}")
364
+ self.descriptions[path]['final_alt'] = "Image description unavailable"
365
 
366
  # For images classified as 2/3-column tables => set alt
367
  for p, info in self.descriptions.items():
 
442
 
443
  return md_content
444
 
 
 
 
445
  class MineruNoTextProcessor:
446
  """
447
  1) Extracts page ranges from the PDF's table of contents (via Gemini).
448
  2) Creates a subset PDF in memory for those pages.
449
  3) Runs magic-pdf analysis on the subset PDF.
450
  4) Generates a Markdown file with images, including table images
451
+ split into row/column cells, with concurrency for Gemini calls.
452
  """
453
  def __init__(self, output_folder: str, gemini_api_key: str = None):
454
  self.output_folder = output_folder
 
460
  self.table_enable = False
461
  self.language = "en"
462
 
463
+ self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=15)
464
  self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
465
 
466
  def cleanup_gpu(self):
 
525
  pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
526
  md_content = pipe_result.get_markdown("local-unique-prefix/")
527
 
528
+ # 7) Post-process => concurrent table classification / description => final MD
529
  final_markdown = image_writer.post_process("local-unique-prefix/", md_content)
530
 
531
  # 8) Save final Markdown
 
556
  logger.warning(f"Skipping topic '{topic}' with invalid range: {rng}")
557
  return pages
558
 
 
 
 
559
  if __name__ == "__main__":
 
560
  input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
561
+ output_dir = "/home/user/app/input_output/outpu"
562
 
 
563
  gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
 
564
 
565
  try:
566
  processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)