MinerU

Paused

App Files Files Community

SkyNait committed on Feb 26

Commit

77844f2

verified ·

1 Parent(s): 2a20a48

Update topic_extraction.py

Browse files

Files changed (1) hide show

topic_extraction.py +89 -31

topic_extraction.py CHANGED Viewed

@@ -4,28 +4,21 @@ import re
 import gc
 import json
 import logging
-import fitz  # PyMuPDF (pip install pymupdf)
 import base64
 import concurrent.futures
 from io import BytesIO
 from typing import List, Dict, Any
-# Attempt to import google.genai
-try:
-    from google import genai
-    from google.genai import types
-except ImportError:
-    genai = None
-    types = None
 import torch
 import cv2
-# Magic PDF pipeline
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-# Your TableExtractor from topic_extraction_upgrade (or similar)
 from table_row_extraction import TableExtractor
 logging.basicConfig(level=logging.INFO)
@@ -87,19 +80,92 @@ class GeminiTopicExtractor:
         prompt = f"""
 You are given the text of a specification PDF.
-Identify the '2 Subject content and assessment information' topic.
-Under that topic, identify subtopics (like 'Paper 1 and Paper 2: Pure Mathematics', etc.)
-and their page ranges (1-based) from the text.
-Return JSON only, with structure:
-{{
-  "2 Subject content and assessment information": {{
-      "Paper 1 and Paper 2: Pure Mathematics": [start_page, end_page],
-      "Paper 3: Statistics and Mechanics": [start_page, end_page]
-  }}
-}}
-No extra explanation, just JSON.
-TEXT:
-{text_content}
         """
         try:
@@ -133,10 +199,6 @@ TEXT:
             logger.error(f"Could not open/read PDF: {e}")
         return "\n".join(text_parts)
-# -------------------------------------------------------------------
-# Gemini-based table classification (Mineru style)
-# -------------------------------------------------------------------
 def call_gemini_for_table_classification(image_data: bytes) -> str:
     if genai is None or types is None:
         logger.warning("Gemini not available. Returning NO_TABLE.")
@@ -482,10 +544,6 @@ class MineruNoTextProcessor:
                     pages.append(p)
         return pages
-# -------------------------------------------------------------------
-# Example usage
-# -------------------------------------------------------------------
 if __name__ == "__main__":
     input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
     output_dir = "/home/user/app/input_output/output"

 import gc
 import json
 import logging
+import fitz
 import base64
 import concurrent.futures
 from io import BytesIO
 from typing import List, Dict, Any
+from google import genai
+from google.genai import types
 import torch
 import cv2
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from table_row_extraction import TableExtractor
 logging.basicConfig(level=logging.INFO)
         prompt = f"""
 You are given the text of a specification PDF.
+Instructions:
+                1. Identify the "Contents" section, which lists all topics, subtopics, and their corresponding pages.
+                2. Extract only the **highest-level, subject-related subtopics** (ignore organizational or administrative sections).
+                3. For subtopics, include the full range of pages from the first to the last subtopic.
+                4. Return the output in the following JSON format:
+                    {{
+                        "topic_name": [start_page, end_page]
+                    }}
+                Important Notes:
+                - Ignore non-subject-related sections (e.g., "Introduction", "Exam Guidelines", "Appendices", "Assessment, Qualification at a glance").
+                - The extracted subtopics should represent major academic areas, not organizational or structural elements.
+                - Make sure that all of the pages for a subtopic are included: the end page should be the page immediately before
+                  the start page of the entry that comes next after the extracted one in the Contents section.
+                Examples:
+                1. Given this table of contents:
+                    1 Introduction – 2
+                        Why choose Edexcel A Level Mathematics? - 2
+                        Supporting you in planning and implementing this qualification - 3
+                        Qualification at a glance - 5
+                    2 Subject content and assessment information – 7
+                        Paper 1 and Paper 2: Pure Mathematics - 11
+                        Paper 3: Statistics and Mechanics - 30
+                        Assessment Objectives - 40
+                    3 Administration and general information – 42
+                        Entries - 42
+                        Access arrangements, reasonable adjustments, special consideration and malpractice - 42
+                        Student recruitment and progression - 45
+                    Appendix 1: Formulae – 49
+                    Appendix 2: Notation – 53
+                    Appendix 3: Use of calculators – 59
+                    Appendix 4: Assessment Objectives – 60
+                    Appendix 5: The context for the development of this qualification – 62
+                    Appendix 6: Transferable skills – 64
+                    Appendix 7: Level 3 Extended Project qualification – 65
+                    Appendix 8: Codes – 67
+                   The correct output should be:
+                    {{
+                        "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
+                        "Paper 3: Statistics and Mechanics": [30, 42]
+                    }}
+                2. Given this table of contents:
+                    Qualification at a glance – 1
+                        Assessment Objectives and weightings - 4
+                    Knowledge, skills and understanding – 5
+                        Theme 1: Introduction to markets and market failure - 5
+                        Theme 2: The UK economy – performance and policies - 11
+                        Theme 3: Business behaviour and the labour market - 21
+                        Theme 4: A global perspective - 29
+                    Assessment – 39
+                        Assessment summary - 39
+                        Assessment objectives - 41
+                        Assessment overview - 42
+                        Breakdown of assessment objectives - 42
+                            Synoptic assessment - 43
+                            Discount code and performance tables - 43
+                            Access arrangements, reasonable adjustments and special consideration - 44
+                            Malpractice - 45
+                            Equality Act 2010 and Pearson equality policy - 45
+                            Synoptic assessment - 46
+                            Awarding and reporting - 47
+                    Other information – 49
+                        Student recruitment - 49
+                        Prior learning and other requirements - 49
+                        Progression - 49
+                    Appendix 1: Transferable skills – 53
+                    Appendix 2: Level 3 Extended Project qualification – 55
+                    Appendix 3: Quantitative skills – 59
+                    Appendix 4: Codes – 61
+                    Appendix 5: Index – 63
+                   The correct output should be:
+                    {{
+                        "Theme 1: Introduction to markets and market failure": [5, 10],
+                        "Theme 2: The UK economy – performance and policies": [11, 20],
+                        "Theme 3: Business behaviour and the labour market": [21, 28],
+                        "Theme 4: A global perspective": [29, 38]
+                    }}
+                    Now, extract topics from this text:{text_content}
         """
         try:
             logger.error(f"Could not open/read PDF: {e}")
         return "\n".join(text_parts)
 def call_gemini_for_table_classification(image_data: bytes) -> str:
     if genai is None or types is None:
         logger.warning("Gemini not available. Returning NO_TABLE.")
                     pages.append(p)
         return pages
 if __name__ == "__main__":
     input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
     output_dir = "/home/user/app/input_output/output"