Correct page range handling
Browse files — This view is limited to 50 files because it contains too many changes.
See raw diff
- __pycache__/inference_svm_model.cpython-310.pyc +0 -0
- __pycache__/mineru_single.cpython-310.pyc +0 -0
- __pycache__/table_row_extraction.cpython-310.pyc +0 -0
- __pycache__/worker.cpython-310.pyc +0 -0
- contents_extractor_v2.py +0 -136
- input_output/ocr-specification-economics.pdf +3 -0
- input_output/outpu/images/img_1.png +0 -0
- input_output/outpu/images/img_10.png +0 -0
- input_output/outpu/images/img_11.png +0 -0
- input_output/outpu/images/img_12.png +0 -0
- input_output/outpu/images/img_13.png +0 -0
- input_output/outpu/images/img_14.png +0 -0
- input_output/outpu/images/img_15.png +0 -0
- input_output/outpu/images/img_16.png +0 -0
- input_output/outpu/images/img_17.png +0 -0
- input_output/outpu/images/img_18.png +0 -0
- input_output/outpu/images/img_19.png +0 -0
- input_output/outpu/images/img_2.png +0 -0
- input_output/outpu/images/img_20.png +0 -0
- input_output/outpu/images/img_21.png +0 -0
- input_output/outpu/images/img_22.png +0 -0
- input_output/outpu/images/img_23.png +0 -0
- input_output/outpu/images/img_24.png +0 -0
- input_output/outpu/images/img_25.png +0 -0
- input_output/outpu/images/img_26.png +0 -0
- input_output/outpu/images/img_3.png +0 -0
- input_output/outpu/images/img_4.png +0 -0
- input_output/outpu/images/img_5.png +0 -0
- input_output/outpu/images/img_6.png +0 -0
- input_output/outpu/images/img_7.png +0 -0
- input_output/outpu/images/img_8.png +0 -0
- input_output/outpu/images/img_9.png +0 -0
- input_output/output/final_output.md +0 -170
- input_output/output/images/img_1.png +0 -0
- input_output/output/images/img_1.png_rows/row_0/col_0.png +0 -0
- input_output/output/images/img_1.png_rows/row_1/col_0.png +0 -0
- input_output/output/images/img_1.png_rows/row_2/col_0.png +0 -0
- input_output/output/images/img_1.png_rows/row_3/col_0.png +0 -0
- input_output/output/images/img_1.png_rows/row_4/col_0.png +0 -0
- input_output/output/images/img_10.png +0 -0
- input_output/output/images/img_10.png_rows/row_0/col_0.png +0 -0
- input_output/output/images/img_10.png_rows/row_0/col_1.png +0 -0
- input_output/output/images/img_10.png_rows/row_1/col_0.png +0 -0
- input_output/output/images/img_10.png_rows/row_2/col_0.png +0 -0
- input_output/output/images/img_10.png_rows/row_2/col_1.png +0 -0
- input_output/output/images/img_11.png +0 -0
- input_output/output/images/img_11.png_rows/row_0/col_0.png +0 -0
- input_output/output/images/img_11.png_rows/row_0/col_1.png +0 -0
- input_output/output/images/img_11.png_rows/row_0/col_2.png +0 -0
- input_output/output/images/img_11.png_rows/row_1/col_0.png +0 -0
__pycache__/inference_svm_model.cpython-310.pyc
CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ
|
|
__pycache__/mineru_single.cpython-310.pyc
CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ
|
|
__pycache__/table_row_extraction.cpython-310.pyc
CHANGED
Binary files a/__pycache__/table_row_extraction.cpython-310.pyc and b/__pycache__/table_row_extraction.cpython-310.pyc differ
|
|
__pycache__/worker.cpython-310.pyc
CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
|
|
contents_extractor_v2.py
DELETED
@@ -1,136 +0,0 @@
|
|
1 |
-
from google import genai
|
2 |
-
from google.genai import types
|
3 |
-
import fitz
|
4 |
-
import requests
|
5 |
-
|
6 |
-
MODEL = "gemini-2.0-flash"
|
7 |
-
|
8 |
-
# TODO: Make sure the last page is included
|
9 |
-
|
10 |
-
|
11 |
-
class ContentsExtractor:
|
12 |
-
def __init__(self, api_key: str):
|
13 |
-
self.client = genai.Client(api_key=api_key)
|
14 |
-
|
15 |
-
@staticmethod
|
16 |
-
def extract_first_pages(pdf_path, num_pages=4, is_path_url=False):
|
17 |
-
try:
|
18 |
-
if is_path_url:
|
19 |
-
r = requests.get(pdf_path)
|
20 |
-
data = r.content
|
21 |
-
doc = fitz.open(stream=data, filetype="pdf")
|
22 |
-
else:
|
23 |
-
doc = fitz.open(pdf_path)
|
24 |
-
total_pages = doc.page_count
|
25 |
-
pages_to_read = min(total_pages, num_pages)
|
26 |
-
all_text = []
|
27 |
-
for page_num in range(pages_to_read):
|
28 |
-
page = doc[page_num]
|
29 |
-
page_text = page.get_text()
|
30 |
-
all_text.append(page_text)
|
31 |
-
doc.close()
|
32 |
-
return "\n".join(all_text)
|
33 |
-
except Exception as e:
|
34 |
-
print(f"Something went wrong: {e}")
|
35 |
-
return None
|
36 |
-
|
37 |
-
def extract_contents(self, content):
|
38 |
-
response = self.client.models.generate_content(
|
39 |
-
model=MODEL,
|
40 |
-
contents=[f"""
|
41 |
-
Task:
|
42 |
-
You will be provided with the first pages of an exam board document. Your goal is to extract
|
43 |
-
the main subject-related topics from the "Contents" section and structure them in a valid JSON format.
|
44 |
-
|
45 |
-
Instructions:
|
46 |
-
1. Identify the "Contents" section, which lists all topics, subtopics, and their corresponding pages.
|
47 |
-
2. Extract only the **highest-level, subject-related subtopics** (ignore organizational or administrative sections).
|
48 |
-
3. For subtopics, include the full range of pages from the first to the last subtopic.
|
49 |
-
4. Return the output in the following JSON format:
|
50 |
-
|
51 |
-
{{
|
52 |
-
"topic_name": [start_page, end_page]
|
53 |
-
}}
|
54 |
-
|
55 |
-
Important Notes:
|
56 |
-
- Ignore non-subject-related sections (e.g., "Introduction", "Exam Guidelines", "Appendices", "Assessment, Qualification at a glance").
|
57 |
-
- The extracted subtopics should represent major academic areas, not organizational or structural elements.
|
58 |
-
- Make sure that all of the pages for a subtopic are included: the end page should be the page immediately before the start page of the
|
59 |
-
topic that comes next after the extracted one in the contents section.
|
60 |
-
|
61 |
-
Examples:
|
62 |
-
1. Given this table of contents:
|
63 |
-
|
64 |
-
1 Introduction – 2
|
65 |
-
Why choose Edexcel A Level Mathematics? - 2
|
66 |
-
Supporting you in planning and implementing this qualification - 3
|
67 |
-
Qualification at a glance - 5
|
68 |
-
2 Subject content and assessment information – 7
|
69 |
-
Paper 1 and Paper 2: Pure Mathematics - 11
|
70 |
-
Paper 3: Statistics and Mechanics - 30
|
71 |
-
Assessment Objectives - 40
|
72 |
-
3 Administration and general information – 42
|
73 |
-
Entries - 42
|
74 |
-
Access arrangements, reasonable adjustments, special consideration and malpractice - 42
|
75 |
-
Student recruitment and progression - 45
|
76 |
-
Appendix 1: Formulae – 49
|
77 |
-
Appendix 2: Notation – 53
|
78 |
-
Appendix 3: Use of calculators – 59
|
79 |
-
Appendix 4: Assessment Objectives – 60
|
80 |
-
Appendix 5: The context for the development of this qualification – 62
|
81 |
-
Appendix 6: Transferable skills – 64
|
82 |
-
Appendix 7: Level 3 Extended Project qualification – 65
|
83 |
-
Appendix 8: Codes – 67
|
84 |
-
|
85 |
-
The correct output should be:
|
86 |
-
|
87 |
-
{{
|
88 |
-
"Paper 1 and Paper 2: Pure Mathematics": [11, 29],
|
89 |
-
"Paper 3: Statistics and Mechanics": [30, 42]
|
90 |
-
}}
|
91 |
-
|
92 |
-
2. Given this table of contents:
|
93 |
-
|
94 |
-
Qualification at a glance – 1
|
95 |
-
Assessment Objectives and weightings - 4
|
96 |
-
Knowledge, skills and understanding – 5
|
97 |
-
Theme 1: Introduction to markets and market failure - 5
|
98 |
-
Theme 2: The UK economy – performance and policies - 11
|
99 |
-
Theme 3: Business behaviour and the labour market - 21
|
100 |
-
Theme 4: A global perspective - 29
|
101 |
-
Assessment – 39
|
102 |
-
Assessment summary - 39
|
103 |
-
Assessment objectives - 41
|
104 |
-
Assessment overview - 42
|
105 |
-
Breakdown of assessment objectives - 42
|
106 |
-
Synoptic assessment - 43
|
107 |
-
Discount code and performance tables - 43
|
108 |
-
Access arrangements, reasonable adjustments and special consideration - 44
|
109 |
-
Malpractice - 45
|
110 |
-
Equality Act 2010 and Pearson equality policy - 45
|
111 |
-
Synoptic assessment - 46
|
112 |
-
Awarding and reporting - 47
|
113 |
-
Other information – 49
|
114 |
-
Student recruitment -49
|
115 |
-
Prior learning and other requirements -49
|
116 |
-
Progression - 49
|
117 |
-
Appendix 1: Transferable skills – 53
|
118 |
-
Appendix 2: Level 3 Extended Project qualification – 55
|
119 |
-
Appendix 3: Quantitative skills – 59
|
120 |
-
Appendix 4: Codes – 61
|
121 |
-
Appendix 5: Index – 63
|
122 |
-
|
123 |
-
The correct output should be:
|
124 |
-
|
125 |
-
{{
|
126 |
-
"Theme 1: Introduction to markets and market failure": [5, 10],
|
127 |
-
"Theme 2: The UK economy – performance and policies": [11, 20],
|
128 |
-
"Theme 3: Business behaviour and the labour market": [21, 28],
|
129 |
-
"Theme 4: A global perspective": [29, 38]
|
130 |
-
}}
|
131 |
-
|
132 |
-
Now, extract topics from this text: {content}
|
133 |
-
"""],
|
134 |
-
config=types.GenerateContentConfig(temperature=0.)
|
135 |
-
)
|
136 |
-
return response.text.strip().replace("```json", "").replace("```", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
input_output/ocr-specification-economics.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d4e96b7e643af9f0f3777e44250470bac0b06cdb707ba62fed8b5b35dab6f581
|
3 |
+
size 9752567
|
input_output/outpu/images/img_1.png
DELETED
Binary file (213 kB)
|
|
input_output/outpu/images/img_10.png
DELETED
Binary file (402 kB)
|
|
input_output/outpu/images/img_11.png
DELETED
Binary file (391 kB)
|
|
input_output/outpu/images/img_12.png
DELETED
Binary file (368 kB)
|
|
input_output/outpu/images/img_13.png
DELETED
Binary file (431 kB)
|
|
input_output/outpu/images/img_14.png
DELETED
Binary file (378 kB)
|
|
input_output/outpu/images/img_15.png
DELETED
Binary file (423 kB)
|
|
input_output/outpu/images/img_16.png
DELETED
Binary file (366 kB)
|
|
input_output/outpu/images/img_17.png
DELETED
Binary file (516 kB)
|
|
input_output/outpu/images/img_18.png
DELETED
Binary file (385 kB)
|
|
input_output/outpu/images/img_19.png
DELETED
Binary file (442 kB)
|
|
input_output/outpu/images/img_2.png
DELETED
Binary file (171 kB)
|
|
input_output/outpu/images/img_20.png
DELETED
Binary file (123 kB)
|
|
input_output/outpu/images/img_21.png
DELETED
Binary file (318 kB)
|
|
input_output/outpu/images/img_22.png
DELETED
Binary file (462 kB)
|
|
input_output/outpu/images/img_23.png
DELETED
Binary file (426 kB)
|
|
input_output/outpu/images/img_24.png
DELETED
Binary file (469 kB)
|
|
input_output/outpu/images/img_25.png
DELETED
Binary file (385 kB)
|
|
input_output/outpu/images/img_26.png
DELETED
Binary file (357 kB)
|
|
input_output/outpu/images/img_3.png
DELETED
Binary file (268 kB)
|
|
input_output/outpu/images/img_4.png
DELETED
Binary file (285 kB)
|
|
input_output/outpu/images/img_5.png
DELETED
Binary file (377 kB)
|
|
input_output/outpu/images/img_6.png
DELETED
Binary file (405 kB)
|
|
input_output/outpu/images/img_7.png
DELETED
Binary file (334 kB)
|
|
input_output/outpu/images/img_8.png
DELETED
Binary file (396 kB)
|
|
input_output/outpu/images/img_9.png
DELETED
Binary file (508 kB)
|
|
input_output/output/final_output.md
DELETED
@@ -1,170 +0,0 @@
|
|
1 |
-

|
2 |
-

|
3 |
-

|
4 |
-

|
5 |
-

|
6 |
-

|
7 |
-

|
8 |
-

|
9 |
-

|
10 |
-

|
11 |
-

|
12 |
-

|
13 |
-

|
14 |
-

|
15 |
-

|
16 |
-

|
17 |
-

|
18 |
-

|
19 |
-

|
20 |
-

|
21 |
-

|
22 |
-

|
23 |
-

|
24 |
-

|
25 |
-

|
26 |
-

|
27 |
-

|
28 |
-

|
29 |
-

|
30 |
-

|
31 |
-

|
32 |
-

|
33 |
-

|
34 |
-

|
35 |
-

|
36 |
-

|
37 |
-

|
38 |
-

|
39 |
-

|
40 |
-

|
41 |
-

|
42 |
-

|
43 |
-

|
44 |
-

|
45 |
-

|
46 |
-

|
47 |
-

|
48 |
-

|
49 |
-

|
50 |
-

|
51 |
-

|
52 |
-

|
53 |
-

|
54 |
-

|
55 |
-

|
56 |
-

|
57 |
-

|
58 |
-

|
59 |
-

|
60 |
-

|
61 |
-

|
62 |
-

|
63 |
-

|
64 |
-

|
65 |
-

|
66 |
-

|
67 |
-

|
68 |
-

|
69 |
-

|
70 |
-

|
71 |
-

|
72 |
-

|
73 |
-

|
74 |
-

|
75 |
-

|
76 |
-

|
77 |
-

|
78 |
-

|
79 |
-

|
80 |
-

|
81 |
-

|
82 |
-

|
83 |
-

|
84 |
-

|
85 |
-

|
86 |
-

|
87 |
-

|
88 |
-

|
89 |
-

|
90 |
-

|
91 |
-

|
92 |
-

|
93 |
-

|
94 |
-

|
95 |
-

|
96 |
-

|
97 |
-

|
98 |
-

|
99 |
-

|
100 |
-

|
101 |
-

|
102 |
-

|
103 |
-

|
104 |
-

|
105 |
-

|
106 |
-

|
107 |
-

|
108 |
-

|
109 |
-

|
110 |
-

|
111 |
-

|
112 |
-

|
113 |
-

|
114 |
-

|
115 |
-

|
116 |
-

|
117 |
-

|
118 |
-

|
119 |
-

|
120 |
-

|
121 |
-

|
122 |
-

|
123 |
-

|
124 |
-

|
125 |
-

|
126 |
-

|
127 |
-

|
128 |
-

|
129 |
-

|
130 |
-

|
131 |
-

|
132 |
-

|
133 |
-

|
134 |
-

|
135 |
-

|
136 |
-

|
137 |
-

|
138 |
-

|
139 |
-

|
140 |
-

|
141 |
-

|
142 |
-

|
143 |
-

|
144 |
-

|
145 |
-

|
146 |
-

|
147 |
-

|
148 |
-

|
149 |
-

|
150 |
-

|
151 |
-

|
152 |
-

|
153 |
-

|
154 |
-

|
155 |
-

|
156 |
-

|
157 |
-

|
158 |
-

|
159 |
-

|
160 |
-

|
161 |
-

|
162 |
-

|
163 |
-

|
164 |
-

|
165 |
-

|
166 |
-

|
167 |
-

|
168 |
-

|
169 |
-

|
170 |
-

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
input_output/output/images/img_1.png
DELETED
Binary file (213 kB)
|
|
input_output/output/images/img_1.png_rows/row_0/col_0.png
DELETED
Binary file (188 kB)
|
|
input_output/output/images/img_1.png_rows/row_1/col_0.png
DELETED
Binary file (57.6 kB)
|
|
input_output/output/images/img_1.png_rows/row_2/col_0.png
DELETED
Binary file (98 kB)
|
|
input_output/output/images/img_1.png_rows/row_3/col_0.png
DELETED
Binary file (57 kB)
|
|
input_output/output/images/img_1.png_rows/row_4/col_0.png
DELETED
Binary file (107 kB)
|
|
input_output/output/images/img_10.png
DELETED
Binary file (402 kB)
|
|
input_output/output/images/img_10.png_rows/row_0/col_0.png
DELETED
Binary file (41.2 kB)
|
|
input_output/output/images/img_10.png_rows/row_0/col_1.png
DELETED
Binary file (339 kB)
|
|
input_output/output/images/img_10.png_rows/row_1/col_0.png
DELETED
Binary file (211 kB)
|
|
input_output/output/images/img_10.png_rows/row_2/col_0.png
DELETED
Binary file (22.5 kB)
|
|
input_output/output/images/img_10.png_rows/row_2/col_1.png
DELETED
Binary file (403 kB)
|
|
input_output/output/images/img_11.png
DELETED
Binary file (391 kB)
|
|
input_output/output/images/img_11.png_rows/row_0/col_0.png
DELETED
Binary file (3.91 kB)
|
|
input_output/output/images/img_11.png_rows/row_0/col_1.png
DELETED
Binary file (7.13 kB)
|
|
input_output/output/images/img_11.png_rows/row_0/col_2.png
DELETED
Binary file (9.66 kB)
|
|
input_output/output/images/img_11.png_rows/row_1/col_0.png
DELETED
Binary file (24.8 kB)
|
|