SkyNait committed on
Commit
e06f439
·
1 Parent(s): b7d667b

correct page range handling

Browse files
__pycache__/inference_svm_model.cpython-310.pyc CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ
 
__pycache__/mineru_single.cpython-310.pyc CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ
 
__pycache__/table_row_extraction.cpython-310.pyc CHANGED
Binary files a/__pycache__/table_row_extraction.cpython-310.pyc and b/__pycache__/table_row_extraction.cpython-310.pyc differ
 
__pycache__/worker.cpython-310.pyc CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
 
output/images/img_1.png ADDED
output/images/img_10.png ADDED
output/images/img_11.png ADDED
output/images/img_12.png ADDED
output/images/img_13.png ADDED
output/images/img_14.png ADDED
output/images/img_15.png ADDED
output/images/img_16.png ADDED
output/images/img_17.png ADDED
output/images/img_18.png ADDED
output/images/img_19.png ADDED
output/images/img_2.png ADDED
output/images/img_20.png ADDED
output/images/img_21.png ADDED
output/images/img_22.png ADDED
output/images/img_23.png ADDED
output/images/img_24.png ADDED
output/images/img_25.png ADDED
output/images/img_26.png ADDED
output/images/img_27.png ADDED
output/images/img_28.png ADDED
output/images/img_3.png ADDED
output/images/img_4.png ADDED
output/images/img_5.png ADDED
output/images/img_6.png ADDED
output/images/img_7.png ADDED
output/images/img_8.png ADDED
output/images/img_9.png ADDED
topic_extr.py CHANGED
@@ -6,95 +6,119 @@ import json
6
  import logging
7
  import fitz
8
  import base64
9
- import cv2
10
- import numpy as np
11
  from io import BytesIO
12
  from typing import List, Dict, Any
13
- import concurrent.futures
14
 
15
  import torch
 
 
16
 
17
- from google import genai
18
- from google.genai import types
19
-
 
 
 
 
20
 
 
21
  from magic_pdf.data.dataset import PymuDocDataset
22
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
23
 
 
24
  from table_row_extraction import TableExtractor
25
 
 
 
 
26
  logging.basicConfig(level=logging.INFO)
27
  logger = logging.getLogger(__name__)
28
  logger.setLevel(logging.INFO)
29
 
 
 
 
 
 
 
 
 
 
30
  def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
31
  """
32
- Creates a new PDF (in memory) containing only the pages in page_indices (0-based).
 
33
  """
34
  if not page_indices:
35
  raise ValueError("No page indices provided for subset creation.")
36
 
37
  doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
38
  new_doc = fitz.open()
39
- sorted_pages = sorted(set(page_indices))
40
- for p in sorted_pages:
41
  if 0 <= p < doc.page_count:
42
  new_doc.insert_pdf(doc, from_page=p, to_page=p)
43
  else:
44
- logger.error(f"Page index {p} is out of range (0..{doc.page_count - 1}).")
45
- raise ValueError(f"Page index {p} is out of range.")
46
  subset_bytes = new_doc.tobytes()
47
  new_doc.close()
48
  doc.close()
49
  return subset_bytes
50
 
51
- def shrink_image_to_jpeg(image_data: bytes, max_dim: int = 800, jpeg_quality: int = 80) -> bytes:
 
 
 
52
  """
53
- Decode image_data, resize so largest dimension <= max_dim, then re-encode as JPEG.
54
- This reduces request size to Gemini significantly.
55
  """
56
- try:
57
- arr = np.frombuffer(image_data, np.uint8)
58
- img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
59
- if img is None:
60
- # Not a valid image, return as-is
61
- return image_data
62
-
63
- h, w, _ = img.shape
64
- scale = 1.0
65
- if max(h, w) > max_dim:
66
- scale = max_dim / float(max(h, w))
67
- if scale < 1.0:
68
- new_w = int(w * scale)
69
- new_h = int(h * scale)
70
- img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
71
-
72
- encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), jpeg_quality]
73
- success, enc = cv2.imencode(".jpg", img, encode_params)
74
- if success:
75
- return enc.tobytes()
76
- else:
77
- logger.warning("Could not encode resized image, returning original.")
78
- return image_data
79
- except Exception as e:
80
- logger.warning(f"shrink_image_to_jpeg error: {e}. Returning original data.")
81
- return image_data
82
 
 
 
 
83
  class GeminiTopicExtractor:
84
  """
85
- Reads the first few pages of a PDF to get the table of contents text,
86
- then uses Gemini to parse out topics -> [start_page, end_page].
 
 
 
 
 
 
 
 
 
 
 
 
87
  """
88
- def __init__(self, api_key: str = None, num_pages: int = 15):
89
  self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
90
  if not self.api_key:
91
- logger.warning("No Gemini API key provided for subtopic extraction.")
92
  self.num_pages = num_pages
93
 
94
- def extract_subtopics(self, pdf_path: str) -> Dict[str, Any]:
95
- text_content = self._read_first_pages(pdf_path, self.num_pages)
96
- if not text_content.strip():
97
- logger.error("No text extracted from the first pages of the PDF.")
 
 
 
 
98
  return {}
99
 
100
  if genai is None or types is None:
@@ -102,100 +126,198 @@ class GeminiTopicExtractor:
102
  return {}
103
 
104
  prompt = f"""
105
- You will be provided with the first pages of an exam board document.
106
- Your goal is to extract the main subject-related topics from the \"Contents\" section
107
- and structure them in a valid JSON format.
108
  Instructions:
109
- 1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
110
- 2. Extract only the **highest-level, subject-related subtopics** (ignore administrative sections).
111
- 3. For each subtopic, return [start_page, end_page] (1-based).
112
- 4. Output valid JSON in the following format:
113
- {{
114
- "Topic A": [start_page, end_page],
115
- "Topic B": [start_page, end_page]
116
- }}
117
-
118
- Important Notes:
119
- - Ignore non-subject-related sections (e.g., 'Introduction', 'Exam Guidelines', 'Appendices',
120
- 'Assessment, Qualification at a glance').
121
- - The extracted subtopics should represent major academic areas, not organizational or structural elements.
122
- - Ignore including the main topic page as start, ONLY subtopic first page.
123
- - Make sure that all of the pages for a subtopic are included; the end page should be (the start page of the
124
- next topic) - 1.
125
-
126
- Now, extract topics from this text: {text_content}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  """
128
-
129
  try:
130
- logger.debug("Calling Gemini to extract subtopics...")
131
  client = genai.Client(api_key=self.api_key)
132
  response = client.models.generate_content(
133
  model="gemini-2.0-flash",
134
  contents=[prompt],
135
  config=types.GenerateContentConfig(temperature=0.0)
136
  )
137
- # Log partial or full LLM response for debugging
138
- if response and response.text:
139
- logger.info(f"[Gemini subtopic extraction] Raw LLM response:\n{response.text}")
 
 
 
 
140
 
141
- raw_text = response.text.strip() if (response and response.text) else "{}"
142
- cleaned = raw_text.replace("```json", "").replace("```", "")
143
  data = json.loads(cleaned)
144
- return data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  except Exception as e:
146
  logger.error(f"Gemini subtopic extraction error: {e}")
147
  return {}
148
 
149
- def _read_first_pages(self, pdf_path: str, num_pages: int) -> str:
150
  text_parts = []
151
  try:
152
  doc = fitz.open(pdf_path)
153
- pages_to_read = min(doc.page_count, num_pages)
154
- for p in range(pages_to_read):
155
- page_text = doc.load_page(p).get_text()
156
- text_parts.append(page_text)
157
  doc.close()
158
  except Exception as e:
159
- logger.error(f"Could not open/read PDF: {e}")
160
  return "\n".join(text_parts)
161
 
 
 
 
162
  def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str:
163
  """
164
- Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE using Gemini (Flash).
165
- We shrink the image first to speed up requests.
166
  """
167
  if not api_key:
168
- logger.warning("No Gemini API key found, returning NO_TABLE.")
169
  return "NO_TABLE"
170
- if not genai or not types:
171
- logger.warning("google.genai not installed, returning NO_TABLE.")
172
  return "NO_TABLE"
173
 
174
- shrunk_data = shrink_image_to_jpeg(image_data, max_dim=800, jpeg_quality=80)
175
-
176
- prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
177
- The three-column 'table' image include such key features:
178
- - Three columns header columns
179
- - Headers like 'Topics', 'Content', 'Guidelines'
180
- - Numbered sections (e.g., 8.4, 9.1)
181
- - Educational curriculum-style structure
182
- The two-column 'table' image include such key features:
183
- - Two columns header columns
184
- - Headers like 'Subject content' and 'Additional information'
185
- - Numbered sections (e.g., 2.1, 3.4)
186
- - Educational curriculum-style structure
187
- - Bullet description in 'Additional information'
188
- If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
189
- If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
190
- If the image does not show a table at all, respond with 'NO_TABLE'.
191
- Return only one of these exact labels as your entire response:
 
 
 
 
 
192
  TWO_COLUMN
193
  THREE_COLUMN
194
  NO_TABLE
195
  """
196
  try:
197
  client = genai.Client(api_key=api_key)
198
- response = client.models.generate_content(
199
  model="gemini-2.0-flash",
200
  contents=[
201
  {
@@ -204,7 +326,7 @@ NO_TABLE
204
  {
205
  "inline_data": {
206
  "mime_type": "image/jpeg",
207
- "data": base64.b64encode(shrunk_data).decode('utf-8')
208
  }
209
  }
210
  ]
@@ -212,82 +334,23 @@ NO_TABLE
212
  ],
213
  config=types.GenerateContentConfig(temperature=0.0)
214
  )
215
- if response and response.text:
216
- logger.info(f"[Gemini table classification] LLM raw response:\n{response.text}")
217
-
218
- classification = (response.text.strip().upper()
219
- if (response and response.text) else "NO_TABLE")
220
- if "THREE" in classification:
221
- return "THREE_COLUMN"
222
- elif "TWO" in classification:
223
- return "TWO_COLUMN"
224
- else:
225
- return "NO_TABLE"
226
  except Exception as e:
227
  logger.error(f"Gemini table classification error: {e}")
228
  return "NO_TABLE"
229
 
230
- def call_gemini_for_image_description(image_data: bytes, api_key: str) -> str:
231
- """
232
- Use Gemini (Flash) to extract a short description from an image.
233
- We also shrink the image first to reduce request time.
234
- """
235
- if not api_key:
236
- logger.warning("No Gemini API key found, returning fallback description.")
237
- return "Image description unavailable"
238
- if not genai or not types:
239
- logger.warning("google.genai not installed, returning fallback description.")
240
- return "Image description unavailable"
241
-
242
- shrunk_data = shrink_image_to_jpeg(image_data, max_dim=800, jpeg_quality=80)
243
-
244
- prompt_text = """The provided image is a part of a question paper or markscheme.
245
- Extract all the necessary information from the image to be able to identify the question.
246
- To identify the question, we only need the following: question number and question part.
247
- Don't include redundant information.
248
- For example, if image contains text like: "Q1 Part A Answer: Life on earth was created by diety..."
249
- you should return just "Q1 Part A Mark Scheme"
250
- If there is no text on this image, return the description of the image. 20 words max.
251
- If there are not enough data, consider information from the surrounding context.
252
- Additionally, if the image contains a truncated part, you must describe it and mark as a
253
- part of some another image that goes before or after current image.
254
- If the image is of a multiple-choice question’s options, then modify your answer by appending
255
- 'MCQ: A [option] B [option] C [option] D [option]' (replacing [option] with the actual options).
256
- Otherwise, follow the above instructions strictly.
257
- """
258
- try:
259
- client = genai.Client(api_key=api_key)
260
- response = client.models.generate_content(
261
- model="gemini-2.0-flash",
262
- contents=[
263
- {
264
- "parts": [
265
- {"text": prompt_text},
266
- {
267
- "inline_data": {
268
- "mime_type": "image/jpeg",
269
- "data": base64.b64encode(shrunk_data).decode('utf-8')
270
- }
271
- }
272
- ]
273
- }
274
- ],
275
- config=types.GenerateContentConfig(temperature=0.0)
276
- )
277
- if response and response.text:
278
- logger.info(f"[Gemini image description] LLM raw response:\n{response.text}")
279
-
280
- return response.text.strip() if (response and response.text) else "Image description unavailable"
281
- except Exception as e:
282
- logger.error(f"Gemini image description error: {e}")
283
- return "Image description unavailable"
284
-
285
  class LocalImageWriter:
286
  """
287
- Saves extracted images, then does concurrent Gemini classification
288
- and description calls. Finally modifies the Markdown to replace
289
- references with final alt text. Also processes table images
290
- into row/column cell images.
291
  """
292
  def __init__(self, output_folder: str, gemini_api_key: str):
293
  self.output_folder = output_folder
@@ -301,77 +364,45 @@ class LocalImageWriter:
301
  self.gemini_api_key = gemini_api_key
302
 
303
  def write(self, path: str, data: bytes) -> None:
304
- """
305
- Called by magic-pdf to save each extracted image.
306
- We store metadata so we can classify the images later.
307
- """
308
  self._img_count += 1
309
- local_filename = f"img_{self._img_count}.png"
310
- local_path = os.path.join(self.images_dir, local_filename)
311
-
312
- with open(local_path, "wb") as f:
313
  f.write(data)
314
-
315
- rel_path_for_md = os.path.relpath(local_path, self.output_folder)
316
  self.descriptions[path] = {
317
  "data": data,
318
- "relative_path": rel_path_for_md,
319
  "table_classification": "NO_TABLE",
320
  "final_alt": ""
321
  }
322
 
323
  def post_process(self, key: str, md_content: str) -> str:
324
- """
325
- 1) Table classification calls (concurrent).
326
- 2) Image description calls for non-table images (concurrent).
327
- 3) Replace placeholders in the Markdown with final alt text.
328
- 4) Process table images => row/col cell images => update Markdown.
329
- 5) Keep only image-reference lines in the final Markdown.
330
- """
331
- # 1) Table classification (CONCURRENT)
332
  logger.info("Classifying images to detect tables (concurrent)...")
333
- with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
334
- future_map = {}
335
  for p, info in self.descriptions.items():
336
- fut = executor.submit(call_gemini_for_table_classification, info["data"], self.gemini_api_key)
337
- future_map[fut] = p
338
 
339
- for fut in concurrent.futures.as_completed(future_map):
340
- path = future_map[fut]
341
  try:
342
  classification = fut.result()
343
  self.descriptions[path]['table_classification'] = classification
344
  except Exception as e:
345
- logger.error(f"Error classifying table for image {path}: {e}")
346
  self.descriptions[path]['table_classification'] = "NO_TABLE"
347
 
348
- # 2) Image description (CONCURRENT), only for NO_TABLE images
349
- logger.info("Generating image descriptions for non-table images (concurrent)...")
350
- with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
351
- future_map_desc = {}
352
- for p, info in self.descriptions.items():
353
- if info['table_classification'] == "NO_TABLE":
354
- fut = executor.submit(call_gemini_for_image_description, info["data"], self.gemini_api_key)
355
- future_map_desc[fut] = p
356
-
357
- for fut in concurrent.futures.as_completed(future_map_desc):
358
- path = future_map_desc[fut]
359
- try:
360
- desc = fut.result()
361
- self.descriptions[path]['final_alt'] = desc
362
- except Exception as e:
363
- logger.error(f"Error describing image {path}: {e}")
364
- self.descriptions[path]['final_alt'] = "Image description unavailable"
365
-
366
- # For images classified as 2/3-column tables => set alt
367
  for p, info in self.descriptions.items():
368
  cls = info['table_classification']
369
  if cls == "TWO_COLUMN":
370
  info['final_alt'] = "HAS TO BE PROCESSED - two column table"
371
  elif cls == "THREE_COLUMN":
372
  info['final_alt'] = "HAS TO BE PROCESSED - three column table"
373
- elif not info['final_alt']:
374
- info['final_alt'] = "Image description unavailable"
375
 
376
  # 3) Replace placeholders in the Markdown
377
  for p, info in self.descriptions.items():
@@ -379,10 +410,10 @@ class LocalImageWriter:
379
  new_md = f"![{info['final_alt']}]({info['relative_path']})"
380
  md_content = md_content.replace(old_md, new_md)
381
 
382
- # 4) Process table images => row/col
383
  md_content = self._process_table_images_in_markdown(md_content)
384
 
385
- # 5) Keep only image-reference lines
386
  final_lines = []
387
  for line in md_content.split("\n"):
388
  if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
@@ -391,13 +422,8 @@ class LocalImageWriter:
391
  return "\n".join(final_lines)
392
 
393
  def _process_table_images_in_markdown(self, md_content: str) -> str:
394
- """
395
- For images flagged as 2/3-column tables, run TableExtractor,
396
- split into row/column cell images, and replace the single
397
- table image reference with multiple cell references.
398
- """
399
- pattern = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
400
- matches = re.findall(pattern, md_content, flags=re.IGNORECASE)
401
  if not matches:
402
  return md_content
403
 
@@ -419,36 +445,38 @@ class LocalImageWriter:
419
  enable_subtopic_merge=False,
420
  subtopic_threshold=0.2
421
  )
422
-
423
  row_boxes = extractor.process_image(abs_image_path)
424
  out_folder = abs_image_path + "_rows"
425
  os.makedirs(out_folder, exist_ok=True)
426
  extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)
427
 
428
- snippet_lines = ["**Extracted table cells:**"]
429
  for i, row in enumerate(row_boxes):
430
  row_dir = os.path.join(out_folder, f"row_{i}")
431
  for j, _ in enumerate(row):
432
- cell_filename = f"col_{j}.png"
433
- cell_abs_path = os.path.join(row_dir, cell_filename)
434
- cell_rel_path = os.path.relpath(cell_abs_path, self.output_folder)
435
- snippet_lines.append(f"![Row {i} Col {j}]({cell_rel_path})")
436
 
437
- new_snippet = "\n".join(snippet_lines)
438
  old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_path})"
439
- md_content = md_content.replace(old_line, new_snippet)
440
  except Exception as e:
441
  logger.error(f"Error processing table image {image_path}: {e}")
442
 
443
  return md_content
444
 
 
 
 
445
  class MineruNoTextProcessor:
446
  """
447
- 1) Extracts page ranges from the PDF's table of contents (via Gemini).
448
- 2) Creates a subset PDF in memory for those pages.
449
- 3) Runs magic-pdf analysis on the subset PDF.
450
- 4) Generates a Markdown file with images, including table images
451
- split into row/column cells, with concurrency for Gemini calls.
452
  """
453
  def __init__(self, output_folder: str, gemini_api_key: str = None):
454
  self.output_folder = output_folder
@@ -456,11 +484,11 @@ class MineruNoTextProcessor:
456
 
457
  self.layout_model = "doclayout_yolo"
458
  self.formula_enable = True
459
- # keep table_enable=False so that entire table is an image
460
  self.table_enable = False
461
  self.language = "en"
462
 
463
- self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=15)
 
464
  self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
465
 
466
  def cleanup_gpu(self):
@@ -474,41 +502,71 @@ class MineruNoTextProcessor:
474
  def process(self, pdf_path: str) -> str:
475
  logger.info(f"Processing PDF: {pdf_path}")
476
  try:
477
- # 1) Extract subtopics from the PDF's contents
478
- topics_data = self.subtopic_extractor.extract_subtopics(pdf_path)
479
- if not topics_data:
480
- raise ValueError("No valid topics extracted from the PDF's table of contents.")
481
-
482
- # 2) Flatten page indices from all topics (1-based)
483
- page_indices = self._collect_page_indices(topics_data)
484
- if not page_indices:
485
- raise ValueError("Extracted page indices are empty.")
486
 
487
- # 3) Read the original PDF into memory
488
  with open(pdf_path, "rb") as f:
489
- original_pdf_bytes = f.read()
490
-
491
- # 4) Validate pages and create subset (convert 1-based to 0-based)
492
- doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
493
  total_pages = doc.page_count
494
  doc.close()
495
 
496
- zero_based = []
497
- for p in page_indices:
498
- z = p - 1
499
- if 0 <= z < total_pages:
500
- zero_based.append(z)
501
- else:
502
- logger.error(f"Page {p} (converted to {z}) is out of 1..{total_pages}")
503
- raise ValueError(f"Page {p} is out of valid range.")
504
- zero_based = sorted(set(zero_based))
505
- if not zero_based:
506
- raise ValueError("No valid pages after conversion to 0-based indices.")
507
-
508
- logger.info(f"Processing pages (0-based): {zero_based}")
509
- subset_pdf_bytes = create_subset_pdf(original_pdf_bytes, zero_based)
510
-
511
- # 5) Run magic-pdf analysis on the subset PDF
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
512
  dataset = PymuDocDataset(subset_pdf_bytes)
513
  inference = doc_analyze(
514
  dataset,
@@ -520,52 +578,35 @@ class MineruNoTextProcessor:
520
  )
521
  logger.info("doc_analyze complete. Extracting images...")
522
 
523
- # 6) Convert to Markdown (images only) via pipe_ocr_mode
524
- image_writer = LocalImageWriter(self.output_folder, gemini_api_key=self.gemini_api_key)
525
- pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
526
  md_content = pipe_result.get_markdown("local-unique-prefix/")
527
 
528
- # 7) Post-process => concurrent table classification / description => final MD
529
- final_markdown = image_writer.post_process("local-unique-prefix/", md_content)
530
 
531
- # 8) Save final Markdown
532
- md_path = os.path.join(self.output_folder, "final_output.md")
533
- with open(md_path, "w", encoding="utf-8") as f:
534
  f.write(final_markdown)
535
 
536
- logger.info(f"Markdown saved to: {md_path}")
537
  return final_markdown
538
 
539
  finally:
540
  self.cleanup_gpu()
541
 
542
- def _collect_page_indices(self, topics_data: Dict[str, Any]) -> List[int]:
543
- """
544
- Flatten the subtopic ranges into a list of pages (1-based).
545
- Example: {"Topic A": [11,29], "Topic B": [30,42]} => [11..29, 30..42]
546
- """
547
- pages = []
548
- for topic, rng in topics_data.items():
549
- if isinstance(rng, list) and len(rng) == 2:
550
- start_p, end_p = rng
551
- if start_p > end_p:
552
- logger.error(f"Invalid page range for topic '{topic}': {rng}")
553
- raise ValueError(f"Invalid page range for topic '{topic}': {rng}")
554
- pages.extend(range(start_p, end_p + 1))
555
- else:
556
- logger.warning(f"Skipping topic '{topic}' with invalid range: {rng}")
557
- return pages
558
-
559
if __name__ == "__main__":
    # Manual smoke run: process one hard-coded specification PDF and print the
    # resulting Markdown.
    input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
    # NOTE(review): "outpu" looks like a typo for "output" — confirm before renaming,
    # since other tooling may already read from this folder.
    output_dir = "/home/user/app/input_output/outpu"

    # SECURITY: the API key must come from the environment only. A real key was
    # previously embedded here as the fallback default; a key committed to source
    # control has to be treated as compromised and rotated.
    gemini_key = os.getenv("GEMINI_API_KEY", "")
    if not gemini_key:
        logger.warning("GEMINI_API_KEY is not set; Gemini-backed steps will be skipped or fail.")

    try:
        processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
        final_md = processor.process(input_pdf)
        print("Final Markdown Output:")
        print(final_md)
    except Exception as e:
        # Top-level boundary of the script: log with traceback instead of crashing.
        logger.exception(f"Processing failed: {e}")
 
6
  import logging
7
  import fitz
8
  import base64
9
+ import concurrent.futures
 
10
  from io import BytesIO
11
  from typing import List, Dict, Any
 
12
 
13
  import torch
14
+ import cv2
15
+ import numpy as np
16
 
17
+ # Attempt top-level import of google.genai
18
+ try:
19
+ from google import genai
20
+ from google.genai import types
21
+ except ImportError:
22
+ genai = None
23
+ types = None
24
 
25
+ # magic-pdf imports
26
  from magic_pdf.data.dataset import PymuDocDataset
27
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
28
 
29
+ # table extraction logic
30
  from table_row_extraction import TableExtractor
31
 
32
+ ###############################################################################
33
+ # Logging Setup
34
+ ###############################################################################
35
  logging.basicConfig(level=logging.INFO)
36
  logger = logging.getLogger(__name__)
37
  logger.setLevel(logging.INFO)
38
 
39
+ ###############################################################################
40
+ # PDF Utility Functions
41
+ ###############################################################################
42
def unify_whitespace(text: str) -> str:
    """
    Normalize text for loose matching.

    Every run of whitespace becomes one space, the ends are trimmed, and the
    result is lowercased.
    """
    collapsed = re.sub(r"\s+", " ", text)
    trimmed = collapsed.strip()
    return trimmed.lower()
47
+
48
def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    """
    Create a new in-memory PDF containing only the requested pages.

    Args:
        original_pdf_bytes: Raw bytes of the source PDF.
        page_indices: 0-based page indices to keep. Duplicates are ignored and
            pages are emitted in ascending order.

    Returns:
        The bytes of the new PDF.

    Raises:
        ValueError: If page_indices is empty or any index is out of range.
    """
    if not page_indices:
        raise ValueError("No page indices provided for subset creation.")

    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
    try:
        new_doc = fitz.open()
        try:
            for p in sorted(set(page_indices)):
                if not 0 <= p < doc.page_count:
                    logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
                    raise ValueError(f"Page index {p} out of range.")
                new_doc.insert_pdf(doc, from_page=p, to_page=p)
            return new_doc.tobytes()
        finally:
            # Close on every path; previously an out-of-range index raised
            # before either document was closed.
            new_doc.close()
    finally:
        doc.close()
68
 
69
+ ###############################################################################
70
+ # Searching in PDF
71
+ ###############################################################################
72
def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
    """
    Return the 0-based indices of all pages whose text contains `search_text`.

    Both the search string and each page's text are normalized with
    unify_whitespace before comparison, so matching ignores case and
    differences in whitespace.

    Args:
        pdf_bytes: Raw bytes of the PDF to scan.
        search_text: Text to look for.

    Returns:
        Page indices in ascending order. Empty if nothing matches or if
        `search_text` is empty or whitespace-only.
    """
    # An empty needle is a substring of every page; treat it as "no match"
    # instead of reporting the whole document.
    if not search_text or not search_text.strip():
        return []

    needle = unify_whitespace(search_text)
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        # NOTE(review): "raw" is not among PyMuPDF's documented get_text options —
        # confirm it yields the plain-text output this search expects.
        return [
            i
            for i in range(doc.page_count)
            if needle in unify_whitespace(doc[i].get_text("raw"))
        ]
    finally:
        # Close even if text extraction fails on some page.
        doc.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
+ ###############################################################################
89
+ # Gemini LLM for Subtopic Extraction
90
+ ###############################################################################
91
  class GeminiTopicExtractor:
92
  """
93
+ Extract subtopics from the PDF by reading the first `num_pages` pages, calling Gemini.
94
+ We expect a structure like:
95
+ {
96
+ "2 Subject content and assessment information": {
97
+ "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
98
+ "Paper 3: Statistics and Mechanics": [30, 42]
99
+ }
100
+ }
101
+ or sometimes just a flat dict:
102
+ {
103
+ "Paper 1 and Paper 2: Pure Mathematics": [15, 33],
104
+ "Paper 3: Statistics and Mechanics": [34, 46]
105
+ }
106
+ We'll parse both forms.
107
  """
108
+ def __init__(self, api_key: str = None, num_pages: int = 10):
109
  self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
110
  if not self.api_key:
111
+ logger.warning("No Gemini API key for subtopic extraction.")
112
  self.num_pages = num_pages
113
 
114
+ def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
115
+ """
116
+ Return a dict of subtopics => [start_page, end_page].
117
+ Could be empty if parsing fails or the LLM can't find subtopics.
118
+ """
119
+ first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
120
+ if not first_pages_text.strip():
121
+ logger.error("No text from first pages => cannot extract subtopics.")
122
  return {}
123
 
124
  if genai is None or types is None:
 
126
  return {}
127
 
128
  prompt = f"""
129
+ You have the first pages of a PDF specification, including a table of contents.
130
+
 
131
  Instructions:
132
+ 1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
133
+ 2. Extract subtopic names -> [start_page, end_page], in valid JSON format only.
134
+ 3. If you can't find any subtopics, return an empty JSON.
135
+
136
+ Examples:
137
+
138
+ 1. Given this table of contents:
139
+
140
+ 1 Introduction – 2
141
+ Why choose Edexcel A Level Mathematics? - 2
142
+ Supporting you in planning and implementing this qualification - 3
143
+ Qualification at a glance - 5
144
+ 2 Subject content and assessment information 7
145
+ Paper 1 and Paper 2: Pure Mathematics - 11
146
+ Paper 3: Statistics and Mechanics - 30
147
+ Assessment Objectives - 40
148
+ 3 Administration and general information – 42
149
+ Entries - 42
150
+ Access arrangements, reasonable adjustments, special consideration and malpractice - 42
151
+ Student recruitment and progression - 45
152
+ Appendix 1: Formulae – 49
153
+ Appendix 2: Notation – 53
154
+ Appendix 3: Use of calculators – 59
155
+ Appendix 4: Assessment Objectives – 60
156
+ Appendix 5: The context for the development of this qualification – 62
157
+ Appendix 6: Transferable skills – 64
158
+ Appendix 7: Level 3 Extended Project qualification – 65
159
+ Appendix 8: Codes – 67
160
+
161
+ The correct output should be:
162
+
163
+ {{
164
+ "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
165
+ "Paper 3: Statistics and Mechanics": [30, 42]
166
+ }}
167
+
168
+ 2. Given this table of contents:
169
+
170
+ Qualification at a glance – 1
171
+ Assessment Objectives and weightings - 4
172
+ Knowledge, skills and understanding – 5
173
+ Theme 1: Introduction to markets and market failure - 5
174
+ Theme 2: The UK economy – performance and policies - 11
175
+ Theme 3: Business behaviour and the labour market - 21
176
+ Theme 4: A global perspective - 29
177
+ Assessment – 39
178
+ Assessment summary - 39
179
+ Assessment objectives - 41
180
+ Assessment overview - 42
181
+ Breakdown of assessment objectives - 42
182
+ Synoptic assessment - 43
183
+ Discount code and performance tables - 43
184
+ Access arrangements, reasonable adjustments and special consideration - 44
185
+ Malpractice - 45
186
+ Equality Act 2010 and Pearson equality policy - 45
187
+ Synoptic assessment - 46
188
+ Awarding and reporting - 47
189
+ Other information – 49
190
+ Student recruitment -49
191
+ Prior learning and other requirements -49
192
+ Progression - 49
193
+ Appendix 1: Transferable skills – 53
194
+ Appendix 2: Level 3 Extended Project qualification – 55
195
+ Appendix 3: Quantitative skills – 59
196
+ Appendix 4: Codes – 61
197
+ Appendix 5: Index – 63
198
+
199
+ The correct output should be:
200
+
201
+ {{
202
+ "Theme 1: Introduction to markets and market failure": [5, 10]
203
+ "Theme 2: The UK economy – performance and policies": - [11, 20]
204
+ "Theme 3: Business behaviour and the labour market": [21, 28]
205
+ "Theme 4: A global perspective": [29, 38]
206
+ }}
207
+
208
+ Now, extract topics from this text:
209
+ {first_pages_text}
210
  """
 
211
  try:
 
212
  client = genai.Client(api_key=self.api_key)
213
  response = client.models.generate_content(
214
  model="gemini-2.0-flash",
215
  contents=[prompt],
216
  config=types.GenerateContentConfig(temperature=0.0)
217
  )
218
+ if not response or not response.text:
219
+ logger.warning("No text from LLM => returning empty subtopics.")
220
+ return {}
221
+
222
+ raw_json = response.text.strip()
223
+ # Clean up triple backticks
224
+ cleaned = raw_json.replace("```json", "").replace("```", "")
225
 
226
+ # Attempt to parse
 
227
  data = json.loads(cleaned)
228
+ # data might be nested or flat
229
+ # if nested, e.g. {"2 Subject content": {"Paper 1...": [11,29]}}
230
+ # if flat, e.g. {"Paper 1...": [11,29]}
231
+ # We'll unify it to a single dict of subname => [start,end].
232
+ final_dict = {}
233
+
234
+ # If the top-level is a dict of dict
235
+ # We look for a dict whose values are themselves subtopics
236
+ # Or it might be a direct subtopic dict
237
+ # We'll try a quick approach:
238
+ # - If any top-level value is a dict with numeric arrays, use that
239
+ # - else assume data is the direct subtopic dict
240
+ found_sub_dict = None
241
+ for k, v in data.items():
242
+ if isinstance(v, dict):
243
+ # might be the sub-sub dict
244
+ found_sub_dict = v
245
+ break
246
+
247
+ if found_sub_dict is not None:
248
+ # parse found_sub_dict
249
+ for subk, rng in found_sub_dict.items():
250
+ if isinstance(rng, list) and len(rng) == 2:
251
+ final_dict[subk] = rng
252
+ else:
253
+ # maybe data is the direct subtopic dict
254
+ # parse data
255
+ for subk, rng in data.items():
256
+ if isinstance(rng, list) and len(rng) == 2:
257
+ final_dict[subk] = rng
258
+
259
+ return final_dict
260
  except Exception as e:
261
  logger.error(f"Gemini subtopic extraction error: {e}")
262
  return {}
263
 
264
def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
    """Return the text of the first ``num_pages`` pages of a PDF.

    Args:
        pdf_path: Path of the PDF file to open.
        num_pages: Maximum number of leading pages to read; capped at the
            document's page count.

    Returns:
        The extracted page texts joined with newlines. Any failure is
        logged, not raised: text collected before the failure is still
        returned, and the result is an empty string when the PDF cannot
        be opened at all.
    """
    text_parts = []
    try:
        doc = fitz.open(pdf_path)
        try:
            pages_to_read = min(num_pages, doc.page_count)
            for i in range(pages_to_read):
                # NOTE(review): "raw" does not appear to be one of PyMuPDF's
                # documented get_text() options ("text", "rawdict", ...);
                # confirm which output format is actually intended here.
                text_parts.append(doc[i].get_text("raw"))
        finally:
            # Release the document handle even when a page fails to extract.
            doc.close()
    except Exception as e:
        logger.error(f"Could not open PDF: {e}")
    return "\n".join(text_parts)
276
 
277
+ ###############################################################################
278
+ # Concurrency for Table Classification
279
+ ###############################################################################
280
  def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str:
281
  """
282
+ Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE using Gemini.
 
283
  """
284
  if not api_key:
285
+ logger.warning("No Gemini API key => NO_TABLE.")
286
  return "NO_TABLE"
287
+ if genai is None or types is None:
288
+ logger.warning("google.genai not installed => NO_TABLE.")
289
  return "NO_TABLE"
290
 
291
+ # Attempt to shrink
292
+ try:
293
+ arr = np.frombuffer(image_data, np.uint8)
294
+ img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
295
+ if img is not None:
296
+ h, w, _ = img.shape
297
+ max_dim = 800
298
+ scale = 1.0
299
+ if max(h, w) > max_dim:
300
+ scale = max_dim / float(max(h, w))
301
+ if scale < 1.0:
302
+ new_w = int(w * scale)
303
+ new_h = int(h * scale)
304
+ img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
305
+ encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), 70]
306
+ success, enc = cv2.imencode(".jpg", img, encode_params)
307
+ if success:
308
+ image_data = enc.tobytes()
309
+ except Exception as e:
310
+ logger.warning(f"shrink_image_to_jpeg error: {e}")
311
+
312
+ prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
313
+ Return only one label:
314
  TWO_COLUMN
315
  THREE_COLUMN
316
  NO_TABLE
317
  """
318
  try:
319
  client = genai.Client(api_key=api_key)
320
+ resp = client.models.generate_content(
321
  model="gemini-2.0-flash",
322
  contents=[
323
  {
 
326
  {
327
  "inline_data": {
328
  "mime_type": "image/jpeg",
329
+ "data": base64.b64encode(image_data).decode('utf-8')
330
  }
331
  }
332
  ]
 
334
  ],
335
  config=types.GenerateContentConfig(temperature=0.0)
336
  )
337
+ if resp and resp.text:
338
+ classification = resp.text.strip().upper()
339
+ if "THREE" in classification:
340
+ return "THREE_COLUMN"
341
+ elif "TWO" in classification:
342
+ return "TWO_COLUMN"
343
+ return "NO_TABLE"
 
 
 
 
344
  except Exception as e:
345
  logger.error(f"Gemini table classification error: {e}")
346
  return "NO_TABLE"
347
 
348
+ ###############################################################################
349
+ # LocalImageWriter
350
+ ###############################################################################
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  class LocalImageWriter:
352
  """
353
+ Writes extracted images, then does concurrency-based table classification calls.
 
 
 
354
  """
355
  def __init__(self, output_folder: str, gemini_api_key: str):
356
  self.output_folder = output_folder
 
364
  self.gemini_api_key = gemini_api_key
365
 
366
  def write(self, path: str, data: bytes) -> None:
 
 
 
 
367
  self._img_count += 1
368
+ fname = f"img_{self._img_count}.png"
369
+ fpath = os.path.join(self.images_dir, fname)
370
+ with open(fpath, "wb") as f:
 
371
  f.write(data)
372
+ rel_path = os.path.relpath(fpath, self.output_folder)
 
373
  self.descriptions[path] = {
374
  "data": data,
375
+ "relative_path": rel_path,
376
  "table_classification": "NO_TABLE",
377
  "final_alt": ""
378
  }
379
 
380
  def post_process(self, key: str, md_content: str) -> str:
 
 
 
 
 
 
 
 
381
  logger.info("Classifying images to detect tables (concurrent)...")
382
+ with concurrent.futures.ThreadPoolExecutor(max_workers=6) as exe:
383
+ fut_map = {}
384
  for p, info in self.descriptions.items():
385
+ fut = exe.submit(call_gemini_for_table_classification, info["data"], self.gemini_api_key)
386
+ fut_map[fut] = p
387
 
388
+ for fut in concurrent.futures.as_completed(fut_map):
389
+ path = fut_map[fut]
390
  try:
391
  classification = fut.result()
392
  self.descriptions[path]['table_classification'] = classification
393
  except Exception as e:
394
+ logger.error(f"Table classification error: {e}")
395
  self.descriptions[path]['table_classification'] = "NO_TABLE"
396
 
397
+ # 2) Set final alt text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
  for p, info in self.descriptions.items():
399
  cls = info['table_classification']
400
  if cls == "TWO_COLUMN":
401
  info['final_alt'] = "HAS TO BE PROCESSED - two column table"
402
  elif cls == "THREE_COLUMN":
403
  info['final_alt'] = "HAS TO BE PROCESSED - three column table"
404
+ else:
405
+ info['final_alt'] = "NO_TABLE image"
406
 
407
  # 3) Replace placeholders in the Markdown
408
  for p, info in self.descriptions.items():
 
410
  new_md = f"![{info['final_alt']}]({info['relative_path']})"
411
  md_content = md_content.replace(old_md, new_md)
412
 
413
+ # 4) If any table images => extract rows
414
  md_content = self._process_table_images_in_markdown(md_content)
415
 
416
+ # 5) Keep only lines that are image references
417
  final_lines = []
418
  for line in md_content.split("\n"):
419
  if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
 
422
  return "\n".join(final_lines)
423
 
424
  def _process_table_images_in_markdown(self, md_content: str) -> str:
425
+ pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
426
+ matches = re.findall(pat, md_content, flags=re.IGNORECASE)
 
 
 
 
 
427
  if not matches:
428
  return md_content
429
 
 
445
  enable_subtopic_merge=False,
446
  subtopic_threshold=0.2
447
  )
 
448
  row_boxes = extractor.process_image(abs_image_path)
449
  out_folder = abs_image_path + "_rows"
450
  os.makedirs(out_folder, exist_ok=True)
451
  extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)
452
 
453
+ snippet = ["**Extracted table cells:**"]
454
  for i, row in enumerate(row_boxes):
455
  row_dir = os.path.join(out_folder, f"row_{i}")
456
  for j, _ in enumerate(row):
457
+ cell_file = f"col_{j}.png"
458
+ cell_path = os.path.join(row_dir, cell_file)
459
+ relp = os.path.relpath(cell_path, self.output_folder)
460
+ snippet.append(f"![Row {i} Col {j}]({relp})")
461
 
462
+ new_snip = "\n".join(snippet)
463
  old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_path})"
464
+ md_content = md_content.replace(old_line, new_snip)
465
  except Exception as e:
466
  logger.error(f"Error processing table image {image_path}: {e}")
467
 
468
  return md_content
469
 
470
+ ###############################################################################
471
+ # MineruNoTextProcessor
472
+ ###############################################################################
473
  class MineruNoTextProcessor:
474
  """
475
+ 1) Use Gemini to get subtopics => e.g. {"Paper 1 and Paper 2: Pure Mathematics": [11,29], ...}
476
+ 2) For each subtopic name => find real occurrence in PDF at or after (start_page-1).
477
+ 3) offset = occurrence_page - (start_page-1). clamp offset >= 0
478
+ 4) Flatten final pages, subset PDF, run magic-pdf => concurrency => final MD
479
+ 5) If no subtopics found, process entire PDF as fallback.
480
  """
481
  def __init__(self, output_folder: str, gemini_api_key: str = None):
482
  self.output_folder = output_folder
 
484
 
485
  self.layout_model = "doclayout_yolo"
486
  self.formula_enable = True
 
487
  self.table_enable = False
488
  self.language = "en"
489
 
490
+ # Use our new flexible approach
491
+ self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=10)
492
  self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
493
 
494
  def cleanup_gpu(self):
 
502
  def process(self, pdf_path: str) -> str:
503
  logger.info(f"Processing PDF: {pdf_path}")
504
  try:
505
+ # 1) Extract subtopics from Gemini
506
+ subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
507
+ logger.info(f"Gemini returned subtopics: {subtopics}")
 
 
 
 
 
 
508
 
509
+ # 2) Read entire PDF
510
  with open(pdf_path, "rb") as f:
511
+ pdf_bytes = f.read()
512
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
 
 
513
  total_pages = doc.page_count
514
  doc.close()
515
 
516
+ final_pages = set()
517
+ if not subtopics:
518
+ logger.warning("No subtopics found. We'll process the entire PDF as fallback.")
519
+ final_pages = set(range(total_pages))
520
+ else:
521
+ # For each subtopic, find occurrence >= (start_p-1)
522
+ for subname, rng in subtopics.items():
523
+ if not (isinstance(rng, list) and len(rng) == 2):
524
+ logger.warning(f"Skipping subtopic '{subname}' => invalid range {rng}")
525
+ continue
526
+ start_p, end_p = rng
527
+ if start_p > end_p:
528
+ logger.warning(f"Skipping subtopic '{subname}' => start> end {rng}")
529
+ continue
530
+
531
+ # find occurrences
532
+ occs = find_all_occurrences(pdf_bytes, subname)
533
+ logger.info(f"Occurrences of subtopic '{subname}': {occs}")
534
+
535
+ doc_start_0 = start_p - 1
536
+ chosen_page = None
537
+ for p in occs:
538
+ if p >= doc_start_0:
539
+ chosen_page = p
540
+ break
541
+ if chosen_page is None:
542
+ # fallback to last or 0
543
+ if occs:
544
+ chosen_page = occs[-1]
545
+ logger.warning(f"No occurrence >= {doc_start_0} for '{subname}'. Using last => {chosen_page}")
546
+ else:
547
+ chosen_page = 0
548
+ logger.warning(f"No occurrences for '{subname}'. Using page 0.")
549
+
550
+ raw_offset = chosen_page - doc_start_0
551
+ offset = max(0, raw_offset)
552
+ logger.info(f"Subtopic '{subname}': doc_start={start_p}, chosen_page={chosen_page}, raw_offset={raw_offset}, offset={offset}")
553
+
554
+ s0 = (start_p - 1) + offset
555
+ e0 = (end_p - 1) + offset
556
+ s0 = max(0, min(total_pages - 1, s0))
557
+ e0 = max(0, min(total_pages - 1, e0))
558
+ for pp in range(s0, e0 + 1):
559
+ final_pages.add(pp)
560
+
561
+ # 3) If final_pages is empty => fallback entire PDF
562
+ if not final_pages:
563
+ logger.warning("No valid pages after offset. We'll process entire PDF.")
564
+ final_pages = set(range(total_pages))
565
+
566
+ logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
567
+ subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
568
+
569
+ # 4) doc_analyze => concurrency => final MD
570
  dataset = PymuDocDataset(subset_pdf_bytes)
571
  inference = doc_analyze(
572
  dataset,
 
578
  )
579
  logger.info("doc_analyze complete. Extracting images...")
580
 
581
+ writer = LocalImageWriter(self.output_folder, self.gemini_api_key)
582
+ pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
 
583
  md_content = pipe_result.get_markdown("local-unique-prefix/")
584
 
585
+ final_markdown = writer.post_process("local-unique-prefix/", md_content)
 
586
 
587
+ # 5) Save
588
+ out_path = os.path.join(self.output_folder, "final_output.md")
589
+ with open(out_path, "w", encoding="utf-8") as f:
590
  f.write(final_markdown)
591
 
592
+ logger.info(f"Markdown saved to: {out_path}")
593
  return final_markdown
594
 
595
  finally:
596
  self.cleanup_gpu()
597
 
598
+ ###############################################################################
599
+ # Example Main
600
+ ###############################################################################
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
  if __name__ == "__main__":
602
  input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
603
+ output_dir = "/home/user/app/output"
 
604
  gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
605
 
606
  try:
607
  processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
608
+ md_output = processor.process(input_pdf)
609
  print("Final Markdown Output:")
610
+ print(md_output)
611
  except Exception as e:
612
  logger.error(f"Processing failed: {e}")