Spaces:

Duplicated from opendatalab/MinerU

Quextro
/

MinerU

Paused

App Files Files Community

SkyNait committed on Feb 27

Commit

b7d667b

·

1 Parent(s): b6c51c5

change the logic

Files changed (30) hide show

__pycache__/contents_extractor_v2.cpython-310.pyc +0 -0
__pycache__/mineru_test_local.cpython-310.pyc +0 -0
__pycache__/topic_extraction_upgrade.cpython-310.pyc +0 -0
input_output/outpu/images/img_1.png +0 -0
input_output/outpu/images/img_10.png +0 -0
input_output/outpu/images/img_11.png +0 -0
input_output/outpu/images/img_12.png +0 -0
input_output/outpu/images/img_13.png +0 -0
input_output/outpu/images/img_14.png +0 -0
input_output/outpu/images/img_15.png +0 -0
input_output/outpu/images/img_16.png +0 -0
input_output/outpu/images/img_17.png +0 -0
input_output/outpu/images/img_18.png +0 -0
input_output/outpu/images/img_19.png +0 -0
input_output/outpu/images/img_2.png +0 -0
input_output/outpu/images/img_20.png +0 -0
input_output/outpu/images/img_21.png +0 -0
input_output/outpu/images/img_22.png +0 -0
input_output/outpu/images/img_23.png +0 -0
input_output/outpu/images/img_24.png +0 -0
input_output/outpu/images/img_25.png +0 -0
input_output/outpu/images/img_26.png +0 -0
input_output/outpu/images/img_3.png +0 -0
input_output/outpu/images/img_4.png +0 -0
input_output/outpu/images/img_5.png +0 -0
input_output/outpu/images/img_6.png +0 -0
input_output/outpu/images/img_7.png +0 -0
input_output/outpu/images/img_8.png +0 -0
input_output/outpu/images/img_9.png +0 -0
topic_extr.py +64 -156

__pycache__/contents_extractor_v2.cpython-310.pyc DELETED Viewed

Binary file (7 kB)

__pycache__/mineru_test_local.cpython-310.pyc DELETED Viewed

Binary file (11.9 kB)

__pycache__/topic_extraction_upgrade.cpython-310.pyc DELETED Viewed

Binary file (10.9 kB)

input_output/outpu/images/img_1.png ADDED Viewed

input_output/outpu/images/img_10.png ADDED Viewed

input_output/outpu/images/img_11.png ADDED Viewed

input_output/outpu/images/img_12.png ADDED Viewed

input_output/outpu/images/img_13.png ADDED Viewed

input_output/outpu/images/img_14.png ADDED Viewed

input_output/outpu/images/img_15.png ADDED Viewed

input_output/outpu/images/img_16.png ADDED Viewed

input_output/outpu/images/img_17.png ADDED Viewed

input_output/outpu/images/img_18.png ADDED Viewed

input_output/outpu/images/img_19.png ADDED Viewed

input_output/outpu/images/img_2.png ADDED Viewed

input_output/outpu/images/img_20.png ADDED Viewed

input_output/outpu/images/img_21.png ADDED Viewed

input_output/outpu/images/img_22.png ADDED Viewed

input_output/outpu/images/img_23.png ADDED Viewed

input_output/outpu/images/img_24.png ADDED Viewed

input_output/outpu/images/img_25.png ADDED Viewed

input_output/outpu/images/img_26.png ADDED Viewed

input_output/outpu/images/img_3.png ADDED Viewed

input_output/outpu/images/img_4.png ADDED Viewed

input_output/outpu/images/img_5.png ADDED Viewed

input_output/outpu/images/img_6.png ADDED Viewed

input_output/outpu/images/img_7.png ADDED Viewed

input_output/outpu/images/img_8.png ADDED Viewed

input_output/outpu/images/img_9.png ADDED Viewed

topic_extr.py CHANGED Viewed

@@ -10,34 +10,23 @@ import cv2
 import numpy as np
 from io import BytesIO
 from typing import List, Dict, Any
 import torch
-# Try to import google.genai
-try:
-    from google import genai
-    from google.genai import types
-except ImportError:
-    genai = None
-    types = None
-# magic-pdf imports
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-# table extraction logic
 from table_row_extraction import TableExtractor
-###############################################################################
-# Logging Setup
-###############################################################################
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
-###############################################################################
-# PDF Subset Creation
-###############################################################################
 def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
     """
     Creates a new PDF (in memory) containing only the pages in page_indices (0-based).
@@ -59,20 +48,16 @@ def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> byt
     doc.close()
     return subset_bytes
-###############################################################################
-# Utility: Shrink Images Before Sending to Gemini
-###############################################################################
 def shrink_image_to_jpeg(image_data: bytes, max_dim: int = 800, jpeg_quality: int = 80) -> bytes:
     """
     Decode image_data, resize so largest dimension <= max_dim, then re-encode as JPEG.
     This reduces request size to Gemini significantly.
     """
     try:
-        # Decode
         arr = np.frombuffer(image_data, np.uint8)
         img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
         if img is None:
-            # Not a valid image, return as is
             return image_data
         h, w, _ = img.shape
@@ -84,7 +69,6 @@ def shrink_image_to_jpeg(image_data: bytes, max_dim: int = 800, jpeg_quality: in
             new_h = int(h * scale)
             img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
-        # Re-encode
         encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), jpeg_quality]
         success, enc = cv2.imencode(".jpg", img, encode_params)
         if success:
@@ -96,15 +80,12 @@ def shrink_image_to_jpeg(image_data: bytes, max_dim: int = 800, jpeg_quality: in
         logger.warning(f"shrink_image_to_jpeg error: {e}. Returning original data.")
         return image_data
-###############################################################################
-# Gemini LLM - Subtopic Extraction
-###############################################################################
 class GeminiTopicExtractor:
     """
     Reads the first few pages of a PDF to get the table of contents text,
     then uses Gemini to parse out topics -> [start_page, end_page].
     """
-    def __init__(self, api_key: str = None, num_pages: int = 14):
         self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
         if not self.api_key:
             logger.warning("No Gemini API key provided for subtopic extraction.")
@@ -117,100 +98,32 @@ class GeminiTopicExtractor:
             return {}
         if genai is None or types is None:
-            logger.warning("google.genai is not installed. Returning empty subtopics.")
             return {}
         prompt = f"""
 You will be provided with the first pages of an exam board document.
-Your goal is to extract the main subject-related topics from the "Contents" section and structure them in a valid JSON format.Instructions:
-    Instructions:
     1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
     2. Extract only the **highest-level, subject-related subtopics** (ignore administrative sections).
     3. For each subtopic, return [start_page, end_page] (1-based).
     4. Output valid JSON in the following format:
-        {{
-        "Topic A": [start_page, end_page],
-        "Topic B": [start_page, end_page]
-        }}
 Important Notes:
-- Ignore non-subject-related sections (e.g., "Introduction", "Exam Guidelines", "Appendices", "Assessment, Qualification at a glance").
 - The extracted subtopics should represent major academic areas, not organizational or structural elements.
-- Make sure that all of the pages for a subtopic are included, end page should be the -1 start page of the topic
-    that comes next after the extracted one in contents section.
-Examples:
-1. Given this table of contents:
-    1 Introduction – 2
-        Why choose Edexcel A Level Mathematics? - 2
-        Supporting you in planning and implementing this qualification - 3
-        Qualification at a glance - 5
-    2 Subject content and assessment information – 7
-        Paper 1 and Paper 2: Pure Mathematics - 11
-        Paper 3: Statistics and Mechanics - 30
-        Assessment Objectives - 40
-    3 Administration and general information – 42
-        Entries - 42
-        Access arrangements, reasonable adjustments, special consideration and malpractice - 42
-        Student recruitment and progression - 45
-    Appendix 1: Formulae – 49
-    Appendix 2: Notation – 53
-    Appendix 3: Use of calculators – 59
-    Appendix 4: Assessment Objectives – 60
-    Appendix 5: The context for the development of this qualification – 62
-    Appendix 6: Transferable skills – 64
-    Appendix 7: Level 3 Extended Project qualification – 65
-    Appendix 8: Codes – 67
-    The correct output should be:
-    {{
-        "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
-        "Paper 3: Statistics and Mechanics": [30, 42]
-    }}
-2. Given this table of contents:
-    Qualification at a glance – 1
-        Assessment Objectives and weightings - 4
-    Knowledge, skills and understanding – 5
-        Theme 1: Introduction to markets and market failure - 5
-        Theme 2: The UK economy – performance and policies - 11
-        Theme 3: Business behaviour and the labour market - 21
-        Theme 4: A global perspective - 29
-    Assessment – 39
-        Assessment summary - 39
-        Assessment objectives - 41
-        Assessment overview - 42
-        Breakdown of assessment objectives - 42
-            Synoptic assessment - 43
-            Discount code and performance tables - 43
-            Access arrangements, reasonable adjustments and special consideration - 44
-            Malpractice - 45
-            Equality Act 2010 and Pearson equality policy - 45
-            Synoptic assessment - 46
-            Awarding and reporting - 47
-    Other information – 49
-        Student recruitment -49
-        Prior learning and other requirements -49
-        Progression - 49
-    Appendix 1: Transferable skills – 53
-    Appendix 2: Level 3 Extended Project qualification – 55
-    Appendix 3: Quantitative skills – 59
-    Appendix 4: Codes – 61
-    Appendix 5: Index – 63
-    The correct output should be:
-    {{
-        "Theme 1: Introduction to markets and market failure": [5, 10]
-        "Theme 2: The UK economy – performance and policies": - [11, 20]
-        "Theme 3: Business behaviour and the labour market": [21, 28]
-        "Theme 4: A global perspective": [29, 38]
-    }}
-    Now, extract topics from this text: {text_content}
 """
         try:
@@ -246,9 +159,6 @@ Examples:
             logger.error(f"Could not open/read PDF: {e}")
         return "\n".join(text_parts)
-###############################################################################
-# Gemini-based Image Classification
-###############################################################################
 def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str:
     """
     Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE using Gemini (Flash).
@@ -261,7 +171,6 @@ def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str
         logger.warning("google.genai not installed, returning NO_TABLE.")
         return "NO_TABLE"
-    # Shrink image
     shrunk_data = shrink_image_to_jpeg(image_data, max_dim=800, jpeg_quality=80)
     prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
@@ -285,14 +194,6 @@ THREE_COLUMN
 NO_TABLE
 """
     try:
-        # Example of optional manual timeout approach (commented out):
-        # import signal
-        # def handler(signum, frame):
-        #     raise TimeoutError("Table classification timed out!")
-        # signal.signal(signal.SIGALRM, handler)
-        # signal.alarm(30)  # 30s timeout
-        logger.debug("Sending image to Gemini for table classification...")
         client = genai.Client(api_key=api_key)
         response = client.models.generate_content(
             model="gemini-2.0-flash",
@@ -311,8 +212,6 @@ NO_TABLE
             ],
             config=types.GenerateContentConfig(temperature=0.0)
         )
-        # signal.alarm(0)  # cancel timeout
         if response and response.text:
             logger.info(f"[Gemini table classification] LLM raw response:\n{response.text}")
@@ -357,7 +256,6 @@ If the image is of a multiple-choice question’s options, then modify your answ
 Otherwise, follow the above instructions strictly.
 """
     try:
-        logger.debug("Sending image to Gemini for description...")
         client = genai.Client(api_key=api_key)
         response = client.models.generate_content(
             model="gemini-2.0-flash",
@@ -384,14 +282,11 @@ Otherwise, follow the above instructions strictly.
         logger.error(f"Gemini image description error: {e}")
         return "Image description unavailable"
-###############################################################################
-# Local Image Writer (Sequential Gemini Calls)
-###############################################################################
 class LocalImageWriter:
     """
-    Saves extracted images, classifies them with Gemini for table/no-table,
-    describes them if no-table, then modifies the Markdown to replace
-    the original references with final alt text. Also processes table images
     into row/column cell images.
     """
     def __init__(self, output_folder: str, gemini_api_key: str):
@@ -427,24 +322,46 @@ class LocalImageWriter:
     def post_process(self, key: str, md_content: str) -> str:
         """
-        1) Classify images as table/no-table (sequential).
-        2) Describe non-table images (sequential).
         3) Replace placeholders in the Markdown with final alt text.
         4) Process table images => row/col cell images => update Markdown.
         5) Keep only image-reference lines in the final Markdown.
         """
-        # 1) Table classification
-        logger.info("Classifying images to detect tables (sequential)...")
-        for p, info in self.descriptions.items():
-            classification = call_gemini_for_table_classification(info["data"], self.gemini_api_key)
-            self.descriptions[p]['table_classification'] = classification
-        # 2) Image description for non-table
-        logger.info("Generating image descriptions for non-table images (sequential)...")
-        for p, info in self.descriptions.items():
-            if info['table_classification'] == "NO_TABLE":
-                desc = call_gemini_for_image_description(info["data"], self.gemini_api_key)
-                info['final_alt'] = desc
         # For images classified as 2/3-column tables => set alt
         for p, info in self.descriptions.items():
@@ -525,16 +442,13 @@ class LocalImageWriter:
         return md_content
-###############################################################################
-# Mineru (magic-pdf) Pipeline with Page-Range Preprocessing
-###############################################################################
 class MineruNoTextProcessor:
     """
     1) Extracts page ranges from the PDF's table of contents (via Gemini).
     2) Creates a subset PDF in memory for those pages.
     3) Runs magic-pdf analysis on the subset PDF.
     4) Generates a Markdown file with images, including table images
-       split into row/column cells.
     """
     def __init__(self, output_folder: str, gemini_api_key: str = None):
         self.output_folder = output_folder
@@ -546,7 +460,7 @@ class MineruNoTextProcessor:
         self.table_enable = False
         self.language = "en"
-        self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=4)
         self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
     def cleanup_gpu(self):
@@ -611,7 +525,7 @@ class MineruNoTextProcessor:
             pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
             md_content = pipe_result.get_markdown("local-unique-prefix/")
-            # 7) Post-process => classify table images => final MD
             final_markdown = image_writer.post_process("local-unique-prefix/", md_content)
             # 8) Save final Markdown
@@ -642,17 +556,11 @@ class MineruNoTextProcessor:
                 logger.warning(f"Skipping topic '{topic}' with invalid range: {rng}")
         return pages
-###############################################################################
-# Main Execution
-###############################################################################
 if __name__ == "__main__":
-    # Example usage:
     input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
-    output_dir = "/home/user/app/input_output/output"
-    # Provide your Gemini API key (or rely on GEMINI_API_KEY env var).
     gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
-    # gemini_key = "YOUR_GEMINI_API_KEY"
     try:
         processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)

 import numpy as np
 from io import BytesIO
 from typing import List, Dict, Any
+import concurrent.futures
 import torch
+from google import genai
+from google.genai import types
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from table_row_extraction import TableExtractor
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
     """
     Creates a new PDF (in memory) containing only the pages in page_indices (0-based).
     doc.close()
     return subset_bytes
 def shrink_image_to_jpeg(image_data: bytes, max_dim: int = 800, jpeg_quality: int = 80) -> bytes:
     """
     Decode image_data, resize so largest dimension <= max_dim, then re-encode as JPEG.
     This reduces request size to Gemini significantly.
     """
     try:
         arr = np.frombuffer(image_data, np.uint8)
         img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
         if img is None:
+            # Not a valid image, return as-is
             return image_data
         h, w, _ = img.shape
             new_h = int(h * scale)
             img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
         encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), jpeg_quality]
         success, enc = cv2.imencode(".jpg", img, encode_params)
         if success:
         logger.warning(f"shrink_image_to_jpeg error: {e}. Returning original data.")
         return image_data
 class GeminiTopicExtractor:
     """
     Reads the first few pages of a PDF to get the table of contents text,
     then uses Gemini to parse out topics -> [start_page, end_page].
     """
+    def __init__(self, api_key: str = None, num_pages: int = 15):
         self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
         if not self.api_key:
             logger.warning("No Gemini API key provided for subtopic extraction.")
             return {}
         if genai is None or types is None:
+            logger.warning("google.genai not installed. Returning empty subtopics.")
             return {}
         prompt = f"""
 You will be provided with the first pages of an exam board document.
+Your goal is to extract the main subject-related topics from the \"Contents\" section
+and structure them in a valid JSON format.
+Instructions:
     1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
     2. Extract only the **highest-level, subject-related subtopics** (ignore administrative sections).
     3. For each subtopic, return [start_page, end_page] (1-based).
     4. Output valid JSON in the following format:
+       {{
+         "Topic A": [start_page, end_page],
+         "Topic B": [start_page, end_page]
+       }}
 Important Notes:
+- Ignore non-subject-related sections (e.g., 'Introduction', 'Exam Guidelines', 'Appendices',
+  'Assessment, Qualification at a glance').
 - The extracted subtopics should represent major academic areas, not organizational or structural elements.
+- Ignore including the main topic page as start, ONLY subtopic first page.
+- Make sure that all of the pages for a subtopic are included; the end page should be (the start page of the
+  next topic) - 1.
+Now, extract topics from this text: {text_content}
 """
         try:
             logger.error(f"Could not open/read PDF: {e}")
         return "\n".join(text_parts)
 def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str:
     """
     Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE using Gemini (Flash).
         logger.warning("google.genai not installed, returning NO_TABLE.")
         return "NO_TABLE"
     shrunk_data = shrink_image_to_jpeg(image_data, max_dim=800, jpeg_quality=80)
     prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
 NO_TABLE
 """
     try:
         client = genai.Client(api_key=api_key)
         response = client.models.generate_content(
             model="gemini-2.0-flash",
             ],
             config=types.GenerateContentConfig(temperature=0.0)
         )
         if response and response.text:
             logger.info(f"[Gemini table classification] LLM raw response:\n{response.text}")
 Otherwise, follow the above instructions strictly.
 """
     try:
         client = genai.Client(api_key=api_key)
         response = client.models.generate_content(
             model="gemini-2.0-flash",
         logger.error(f"Gemini image description error: {e}")
         return "Image description unavailable"
 class LocalImageWriter:
     """
+    Saves extracted images, then does concurrent Gemini classification
+    and description calls. Finally modifies the Markdown to replace
+    references with final alt text. Also processes table images
     into row/column cell images.
     """
     def __init__(self, output_folder: str, gemini_api_key: str):
     def post_process(self, key: str, md_content: str) -> str:
         """
+        1) Table classification calls (concurrent).
+        2) Image description calls for non-table images (concurrent).
         3) Replace placeholders in the Markdown with final alt text.
         4) Process table images => row/col cell images => update Markdown.
         5) Keep only image-reference lines in the final Markdown.
         """
+        # 1) Table classification (CONCURRENT)
+        logger.info("Classifying images to detect tables (concurrent)...")
+        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+            future_map = {}
+            for p, info in self.descriptions.items():
+                fut = executor.submit(call_gemini_for_table_classification, info["data"], self.gemini_api_key)
+                future_map[fut] = p
+            for fut in concurrent.futures.as_completed(future_map):
+                path = future_map[fut]
+                try:
+                    classification = fut.result()
+                    self.descriptions[path]['table_classification'] = classification
+                except Exception as e:
+                    logger.error(f"Error classifying table for image {path}: {e}")
+                    self.descriptions[path]['table_classification'] = "NO_TABLE"
+        # 2) Image description (CONCURRENT), only for NO_TABLE images
+        logger.info("Generating image descriptions for non-table images (concurrent)...")
+        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+            future_map_desc = {}
+            for p, info in self.descriptions.items():
+                if info['table_classification'] == "NO_TABLE":
+                    fut = executor.submit(call_gemini_for_image_description, info["data"], self.gemini_api_key)
+                    future_map_desc[fut] = p
+            for fut in concurrent.futures.as_completed(future_map_desc):
+                path = future_map_desc[fut]
+                try:
+                    desc = fut.result()
+                    self.descriptions[path]['final_alt'] = desc
+                except Exception as e:
+                    logger.error(f"Error describing image {path}: {e}")
+                    self.descriptions[path]['final_alt'] = "Image description unavailable"
         # For images classified as 2/3-column tables => set alt
         for p, info in self.descriptions.items():
         return md_content
 class MineruNoTextProcessor:
     """
     1) Extracts page ranges from the PDF's table of contents (via Gemini).
     2) Creates a subset PDF in memory for those pages.
     3) Runs magic-pdf analysis on the subset PDF.
     4) Generates a Markdown file with images, including table images
+       split into row/column cells, with concurrency for Gemini calls.
     """
     def __init__(self, output_folder: str, gemini_api_key: str = None):
         self.output_folder = output_folder
         self.table_enable = False
         self.language = "en"
+        self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=15)
         self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
     def cleanup_gpu(self):
             pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
             md_content = pipe_result.get_markdown("local-unique-prefix/")
+            # 7) Post-process => concurrent table classification / description => final MD
             final_markdown = image_writer.post_process("local-unique-prefix/", md_content)
             # 8) Save final Markdown
                 logger.warning(f"Skipping topic '{topic}' with invalid range: {rng}")
         return pages
 if __name__ == "__main__":
     input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
+    output_dir = "/home/user/app/input_output/outpu"
     gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
     try:
         processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)