MinerU

Paused

App Files Files Community

SkyNait committed on Mar 5

Commit

5e4a36e

1 Parent(s): 9351a05

correct page array handling

Browse files

Files changed (3) hide show

output/sample_spec_output.md +63 -0
page_range.py +100 -58
topic_extr.py +1 -4

output/sample_spec_output.md ADDED Viewed

	@@ -0,0 +1,63 @@

+# Paper 1 and Paper 2: Pure Mathematics
+To support the co-teaching of this qualification with the AS Mathematics qualification, common content has been highlighted in bold.
+![](/topic-extraction/8ad59648516f3e9564f0e5df8114f87cd48c2fe5f34b15c28c704962b31adc70.jpg)
+![](/topic-extraction/8116200eb839fa0c6d87bb6e96db29559283cc3d7de7ff3834326012ca2d37e3.jpg)
+![](/topic-extraction/4cc2bdaa64251411d29493fafb406ad9974260459b247be498e312e29b969a15.jpg)
+![](/topic-extraction/a06c5ac3695ab4caff0dc2724c9a8a288fefc94cb1b79e370975be31d3869230.jpg)
+![](/topic-extraction/c5631064f99712df9f9591a603ae00098be039f661264ce67925a18c90e06142.jpg)
+![](/topic-extraction/88fdba19faed0f761e041fbf040b8cfc57c73bdf36fcd6e32f59f09ac91aeab6.jpg)
+![](/topic-extraction/a9a669c1c64b92583f2cc72a8216854bb76c90586a0615afdaee9f0d26d120e9.jpg)
+![](/topic-extraction/6b5c9d3211ba2d7c95de68ed81f03fc32f1aba22d55e8ba53fb4586fdb270426.jpg)
+![](/topic-extraction/310a9b6f2764de2b165de3343fea3e64ddbd36f8d43c5962dd48730a9e729019.jpg)
+![](/topic-extraction/4d3fa5997973de85edbf15b31c91f5d1822c5d9698cbb953d1cd9fff04fca369.jpg)
+![](/topic-extraction/480483c021c62d2499f240729e15a6aae16aa6d3be9aec2c65a16e6dd6b878e5.jpg)
+![](/topic-extraction/4b2d26dfff554e5c0e2e33968ea3fbae882e9deec5aa2607288ac72f05fbc093.jpg)
+![](/topic-extraction/de35db590f61b05cf88744cd89789d664a6abb48c94ef6fb2f380404e0b6aa56.jpg)
+![](/topic-extraction/230e72098ba7930d8338b8c0bc7c184e7129ec59141952e7c57f127655a00164.jpg)
+![](/topic-extraction/0247eaaab6c95cbc124fa87c44936e2d9963699fd3bc7522596f997029426354.jpg)
+![](/topic-extraction/e95d6f913ef911a562b5c5c0e336cf6265c90753738fbf1fb5b86a0370573286.jpg)
+![](/topic-extraction/e75eb0c3ddebeb5cdef32f0a4281f98c0f435792630fd5cf2a60827fed6496ae.jpg)
+![](/topic-extraction/6db71c2167c71b32503e4025534a9111558ee893c9b94335f73e8d965bdb3e7f.jpg)
+# Paper 3: Statistics and Mechanics
+All the Pure Mathematics content is assumed knowledge for Paper 3 and may be tested in parts of questions.
+To support the co-teaching of this qualification with the AS Mathematics qualification, common content has been highlighted in bold.
+![](/topic-extraction/bdc8dba766b71c8baa1fa78425fa9b05960de72fa2e3cd58acec0ed9f6a38484.jpg)
+![](/topic-extraction/8a7e0f0815ec510978f1e4629f452be0f698ae3b2b73fdd0c6cb6d01b73c658d.jpg)
+![](/topic-extraction/c0f6c78a4393655d252cf16cf91690f5b853c925eef73e15ce9473f6039518e8.jpg)
+![](/topic-extraction/d8fc74d90978852def7740a09c94949a8b30a37248555561f4997f4d40bad7b1.jpg)
+![](/topic-extraction/c27edd49d1ff81e5e31321b53fc559bac988181af672cda7fe65fb17e48fd674.jpg)
+![](/topic-extraction/f82f21d337bc60d0dc797db76b5738144904989fb044160d9fcceaa41651aa33.jpg)
+![](/topic-extraction/74059e4d980d876dec0451f14e791402349da955dda7308450dccc287bed0147.jpg)
+![](/topic-extraction/263c0b8a692bad208c16544fd15d1b12c10dae66e88f3067e4c34932af7eebc4.jpg)
+![](/topic-extraction/80919764b501319dc4a0fd6715bd31192ad14c7090ed0aed89eabef833b7622e.jpg)

page_range.py CHANGED Viewed

@@ -5,7 +5,9 @@ import json
 import logging
 import fitz
 import requests
 from statistics import mode, median
 from google import genai
 from google.genai import types
@@ -62,7 +64,7 @@ You have the first pages of a PDF specification, including a table of contents.
 Instructions:
 1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
 2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
-3. For each subtopic, give the range of pages [start_page, end_page -1] (1-based) from the table of contents.
 4. Output only valid JSON of the form:
     {{
     "Subtopic A": [start_page, end_page],
@@ -72,6 +74,7 @@ Instructions:
 Important notes:
 - The correct "end_page" must be the page number of the next topic or subtopic minus 1.
 - The final output must be valid JSON only, with no extra text or code blocks.
 Examples:
 1. Given this table of contents:
 1 Introduction – 2
@@ -104,6 +107,7 @@ Assessment – 39
     Assessment summary - 39
     Assessment objectives - 41
     Assessment overview - 42
 The correct output should be:
 {{
@@ -112,6 +116,57 @@ The correct output should be:
     "Theme 3: Business behaviour and the labour market": [21, 28],
     "Theme 4: A global perspective": [29, 38]
 }}
 Now, extract topics from this text:
 {first_pages_text}
 """
@@ -180,79 +235,66 @@ class TopicRangeExtractor:
         total_pages = doc.page_count
         doc.close()
-        # Compute global offset and adjust subtopic ranges.
         if not subtopics:
-            global_offset = 0
-            subtopics_corrected = {}
-        else:
-            offset_candidates = []
-            subtopics_corrected = {}
-            for subname, rng in subtopics.items():
-                if not (isinstance(rng, list) and len(rng) == 2):
-                    continue
-                start_p, end_p = rng
-                occs = find_all_occurrences(pdf_bytes, subname)
-                for p in occs:
-                    candidate = p - (start_p - 1)
-                    if candidate > 0:
-                        offset_candidates.append(candidate)
-                subtopics_corrected[subname] = rng
-            if offset_candidates:
-                try:
-                    global_offset = mode(offset_candidates)
-                except Exception:
-                    global_offset = int(median(offset_candidates))
-            else:
-                global_offset = 0
-            logger.info(f"Computed global offset: {global_offset}")
-        # Adjust ranges by applying the global offset.
-        adjusted_topics = {}
         for subname, rng in subtopics_corrected.items():
             start_p, end_p = rng
-            s0 = (start_p - 1) + global_offset
             e0 = (end_p - 1) + global_offset
-            adjusted_topics[subname] = [s0, e0]
-        # Sort the topics by their adjusted start page.
-        sorted_topics = sorted(adjusted_topics.items(), key=lambda item: item[1][0])
-        effective_ranges = {}
-        # For each subtopic, if there is a next one, set its effective end to the next topic's start minus 1.
-        for i, (name, (start, end)) in enumerate(sorted_topics):
-            if i < len(sorted_topics) - 1:
-                next_start = sorted_topics[i+1][1][0]
-                effective_end = min(end, next_start - 1)
             else:
-                effective_end = end
-            effective_ranges[name] = [start, effective_end]
-        # Build the union of pages from each effective range.
-        # For every topic except the last, use a half-open range to skip the boundary page.
         real_pages_set = set()
-        for i, (name, (start, end)) in enumerate(sorted_topics):
-            if i < len(sorted_topics) - 1:
-                # End is exclusive so the boundary page (end) is skipped.
-                for pp in range(start, end):
-                    if 0 <= pp < total_pages:
-                        real_pages_set.add(pp)
-            else:
-                # For the last topic include the end page.
-                for pp in range(start, end + 1):
-                    if 0 <= pp < total_pages:
-                        real_pages_set.add(pp)
-        page_range = sorted(real_pages_set)
-        return {
-            "page_range": page_range
-        }
 if __name__ == "__main__":
-    input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
     gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
     try:
         extractor = TopicRangeExtractor(gemini_api_key=gemini_key)
         result = extractor.process(input_pdf)
-        print(json.dumps(result, indent=2))
     except Exception as e:
         logger.error(f"Processing failed: {e}")

 import logging
 import fitz
 import requests
+import time
 from statistics import mode, median
+from typing import Dict, List, Tuple
 from google import genai
 from google.genai import types
 Instructions:
 1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
 2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
+3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
 4. Output only valid JSON of the form:
     {{
     "Subtopic A": [start_page, end_page],
 Important notes:
 - The correct "end_page" must be the page number of the next topic or subtopic minus 1.
 - The final output must be valid JSON only, with no extra text or code blocks.
 Examples:
 1. Given this table of contents:
 1 Introduction – 2
     Assessment summary - 39
     Assessment objectives - 41
     Assessment overview - 42
+    Breakdown of assessment objectives - 42
 The correct output should be:
 {{
     "Theme 3: Business behaviour and the labour market": [21, 28],
     "Theme 4: A global perspective": [29, 38]
 }}
+3. You might also see sections like:
+2.1 AS Unit 1 11
+2.2 AS Unit 2 18
+2.3 A2 Unit 3 24
+2.4 A2 Unit 4 31
+In that scenario, your output might look like:
+{{
+    "2.1 AS Unit 1": [11, 17],
+    "2.2 AS Unit 2": [18, 23],
+    "2.3 A2 Unit 3": [24, 30],
+    "2.4 A2 Unit 4": [31, 35]
+}}
+or
+2.1 AS units 6
+2.2 AS units 23
+In that scenario, your output might look like:
+{{
+    "2.1 AS Unit 1": [6, 22],
+    "2.2 AS Unit 2": [23, 43]
+}}
+4. Another example might list subtopics:
+3.1 Overarching themes 11
+3.2 A: Proof 12
+3.3 B: Algebra and functions 13
+3.4 C: Coordinate geometry in the ( x , y ) plane 14
+3.5 D: Sequences and series 15
+3.6 E: Trigonometry 16
+3.7 F: Exponentials and logarithms 17
+3.8 G: Differentiation 18
+3.9 H: Integration 19
+3.10 I: Numerical methods 20
+3.11 J: Vectors 20
+3.12 K: Statistical sampling 21
+3.13 L: Data presentation and interpretation 21
+3.14 M: Probability 22
+3.15 N: Statistical distributions 23
+3.16 O: Statistical hypothesis testing 23
+3.17 P: Quantities and units in mechanics 24
+3.18 Q: Kinematics 24
+3.19 R: Forces and Newton’s laws 24
+3.20 S: Moments 25
+3.21 Use of data in statistics 26
+Here the correct output might look like:
+{{
+    "A: Proof": [12, 12],
+    "B: Algebra and functions": [13, 13],
+    ...
+}}
 Now, extract topics from this text:
 {first_pages_text}
 """
         total_pages = doc.page_count
         doc.close()
         if not subtopics:
+            return {"page_range": list(range(total_pages))}
+        offset_candidates = []
+        subtopics_corrected = {}
+        for subname, rng in subtopics.items():
+            if not (isinstance(rng, list) and len(rng) == 2):
+                continue
+            start_p, end_p = rng
+            occs = find_all_occurrences(pdf_bytes, subname)
+            for p in occs:
+                candidate = p - (start_p - 1)
+                if candidate > 0:
+                    offset_candidates.append(candidate)
+            subtopics_corrected[subname] = rng
+        if offset_candidates:
+            try:
+                global_offset = mode(offset_candidates)
+            except Exception:
+                global_offset = int(median(offset_candidates))
+        else:
+            global_offset = 0
+        logger.info(f"Computed global offset: {global_offset}")
+        adjusted_subtopics = []
         for subname, rng in subtopics_corrected.items():
             start_p, end_p = rng
+            s0 = (start_p) + global_offset
             e0 = (end_p - 1) + global_offset
+            adjusted_subtopics.append((subname, (s0, e0)))
+        sorted_subtopics = sorted(adjusted_subtopics, key=lambda x: x[1][0])
+        final_subtopics = []
+        for i in range(len(sorted_subtopics)):
+            subname, (s0, e0) = sorted_subtopics[i]
+            if i < len(sorted_subtopics) - 1:
+                next_s0 = sorted_subtopics[i + 1][1][0]
+                new_e0 = min(e0, next_s0 - 1)
             else:
+                new_e0 = min(e0, total_pages - 1)
+            final_subtopics.append((subname, (s0, new_e0)))
         real_pages_set = set()
+        for subname, (s0, e0) in final_subtopics:
+            for pp in range(s0, e0 + 1):
+                if 0 <= pp < total_pages:
+                    real_pages_set.add(pp)
+        page_range = sorted(real_pages_set)
+        logger.info(f"Final page range: {page_range}")
+        return {"page_range": page_range}
 if __name__ == "__main__":
+    input_pdf = "/home/user/app/input_output/pearson-A_Level_Economics.pdf"
     gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
     try:
         extractor = TopicRangeExtractor(gemini_api_key=gemini_key)
         result = extractor.process(input_pdf)
+        # print(json.dumps(result, indent=2))
     except Exception as e:
         logger.error(f"Processing failed: {e}")

topic_extr.py CHANGED Viewed

@@ -184,10 +184,7 @@ def main():
                     "key": "sample_spec",
                     "url": "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf",
                     "type": "specification",
-                    "page": [
-                        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
-                        28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41
-                    ]
                 }
             ],
             "topics": [

                     "key": "sample_spec",
                     "url": "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf",
                     "type": "specification",
+                    "page":  [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42]
                 }
             ],
             "topics": [