correct page array handling
Browse files- output/sample_spec_output.md +63 -0
- page_range.py +100 -58
- topic_extr.py +1 -4
output/sample_spec_output.md
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Paper 1 and Paper 2: Pure Mathematics
|
2 |
+
|
3 |
+
To support the co-teaching of this qualification with the AS Mathematics qualification, common content has been highlighted in bold.
|
4 |
+
|
5 |
+

|
6 |
+
|
7 |
+

|
8 |
+
|
9 |
+

|
10 |
+
|
11 |
+

|
12 |
+
|
13 |
+

|
14 |
+
|
15 |
+

|
16 |
+
|
17 |
+

|
18 |
+
|
19 |
+

|
20 |
+
|
21 |
+

|
22 |
+
|
23 |
+

|
24 |
+
|
25 |
+

|
26 |
+
|
27 |
+

|
28 |
+
|
29 |
+

|
30 |
+
|
31 |
+

|
32 |
+
|
33 |
+

|
34 |
+
|
35 |
+

|
36 |
+
|
37 |
+

|
38 |
+
|
39 |
+

|
40 |
+
|
41 |
+
# Paper 3: Statistics and Mechanics
|
42 |
+
|
43 |
+
All the Pure Mathematics content is assumed knowledge for Paper 3 and may be tested in parts of questions.
|
44 |
+
|
45 |
+
To support the co-teaching of this qualification with the AS Mathematics qualification, common content has been highlighted in bold.
|
46 |
+
|
47 |
+

|
48 |
+
|
49 |
+

|
50 |
+
|
51 |
+

|
52 |
+
|
53 |
+

|
54 |
+
|
55 |
+

|
56 |
+
|
57 |
+

|
58 |
+
|
59 |
+

|
60 |
+
|
61 |
+

|
62 |
+
|
63 |
+

|
page_range.py
CHANGED
@@ -5,7 +5,9 @@ import json
|
|
5 |
import logging
|
6 |
import fitz
|
7 |
import requests
|
|
|
8 |
from statistics import mode, median
|
|
|
9 |
|
10 |
from google import genai
|
11 |
from google.genai import types
|
@@ -62,7 +64,7 @@ You have the first pages of a PDF specification, including a table of contents.
|
|
62 |
Instructions:
|
63 |
1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
|
64 |
2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
|
65 |
-
3. For each subtopic, give the range of pages [start_page, end_page
|
66 |
4. Output only valid JSON of the form:
|
67 |
{{
|
68 |
"Subtopic A": [start_page, end_page],
|
@@ -72,6 +74,7 @@ Instructions:
|
|
72 |
Important notes:
|
73 |
- The correct "end_page" must be the page number of the next topic or subtopic minus 1.
|
74 |
- The final output must be valid JSON only, with no extra text or code blocks.
|
|
|
75 |
Examples:
|
76 |
1. Given this table of contents:
|
77 |
1 Introduction – 2
|
@@ -104,6 +107,7 @@ Assessment – 39
|
|
104 |
Assessment summary - 39
|
105 |
Assessment objectives - 41
|
106 |
Assessment overview - 42
|
|
|
107 |
|
108 |
The correct output should be:
|
109 |
{{
|
@@ -112,6 +116,57 @@ The correct output should be:
|
|
112 |
"Theme 3: Business behaviour and the labour market": [21, 28],
|
113 |
"Theme 4: A global perspective": [29, 38]
|
114 |
}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
Now, extract topics from this text:
|
116 |
{first_pages_text}
|
117 |
"""
|
@@ -180,79 +235,66 @@ class TopicRangeExtractor:
|
|
180 |
total_pages = doc.page_count
|
181 |
doc.close()
|
182 |
|
183 |
-
# Compute global offset and adjust subtopic ranges.
|
184 |
if not subtopics:
|
185 |
-
|
186 |
-
subtopics_corrected = {}
|
187 |
-
else:
|
188 |
-
offset_candidates = []
|
189 |
-
subtopics_corrected = {}
|
190 |
-
for subname, rng in subtopics.items():
|
191 |
-
if not (isinstance(rng, list) and len(rng) == 2):
|
192 |
-
continue
|
193 |
-
start_p, end_p = rng
|
194 |
-
occs = find_all_occurrences(pdf_bytes, subname)
|
195 |
-
for p in occs:
|
196 |
-
candidate = p - (start_p - 1)
|
197 |
-
if candidate > 0:
|
198 |
-
offset_candidates.append(candidate)
|
199 |
-
subtopics_corrected[subname] = rng
|
200 |
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
|
210 |
-
|
211 |
-
adjusted_topics = {}
|
212 |
for subname, rng in subtopics_corrected.items():
|
213 |
start_p, end_p = rng
|
214 |
-
s0 = (start_p
|
215 |
e0 = (end_p - 1) + global_offset
|
216 |
-
|
217 |
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
effective_end = min(end, next_start - 1)
|
226 |
else:
|
227 |
-
|
228 |
-
|
229 |
|
230 |
-
# Build the union of pages from each effective range.
|
231 |
-
# For every topic except the last, use a half-open range to skip the boundary page.
|
232 |
real_pages_set = set()
|
233 |
-
for
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
if 0 <= pp < total_pages:
|
238 |
-
real_pages_set.add(pp)
|
239 |
-
else:
|
240 |
-
# For the last topic include the end page.
|
241 |
-
for pp in range(start, end + 1):
|
242 |
-
if 0 <= pp < total_pages:
|
243 |
-
real_pages_set.add(pp)
|
244 |
-
page_range = sorted(real_pages_set)
|
245 |
|
246 |
-
|
247 |
-
|
248 |
-
}
|
249 |
|
250 |
if __name__ == "__main__":
|
251 |
-
input_pdf = "/home/user/app/input_output/
|
252 |
gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
|
253 |
try:
|
254 |
extractor = TopicRangeExtractor(gemini_api_key=gemini_key)
|
255 |
result = extractor.process(input_pdf)
|
256 |
-
print(json.dumps(result, indent=2))
|
257 |
except Exception as e:
|
258 |
logger.error(f"Processing failed: {e}")
|
|
|
5 |
import logging
|
6 |
import fitz
|
7 |
import requests
|
8 |
+
import time
|
9 |
from statistics import mode, median
|
10 |
+
from typing import Dict, List, Tuple
|
11 |
|
12 |
from google import genai
|
13 |
from google.genai import types
|
|
|
64 |
Instructions:
|
65 |
1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
|
66 |
2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
|
67 |
+
3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
|
68 |
4. Output only valid JSON of the form:
|
69 |
{{
|
70 |
"Subtopic A": [start_page, end_page],
|
|
|
74 |
Important notes:
|
75 |
- The correct "end_page" must be the page number of the next topic or subtopic minus 1.
|
76 |
- The final output must be valid JSON only, with no extra text or code blocks.
|
77 |
+
|
78 |
Examples:
|
79 |
1. Given this table of contents:
|
80 |
1 Introduction – 2
|
|
|
107 |
Assessment summary - 39
|
108 |
Assessment objectives - 41
|
109 |
Assessment overview - 42
|
110 |
+
Breakdown of assessment objectives - 42
|
111 |
|
112 |
The correct output should be:
|
113 |
{{
|
|
|
116 |
"Theme 3: Business behaviour and the labour market": [21, 28],
|
117 |
"Theme 4: A global perspective": [29, 38]
|
118 |
}}
|
119 |
+
|
120 |
+
3. You might also see sections like:
|
121 |
+
2.1 AS Unit 1 11
|
122 |
+
2.2 AS Unit 2 18
|
123 |
+
2.3 A2 Unit 3 24
|
124 |
+
2.4 A2 Unit 4 31
|
125 |
+
In that scenario, your output might look like:
|
126 |
+
{{
|
127 |
+
"2.1 AS Unit 1": [11, 17],
|
128 |
+
"2.2 AS Unit 2": [18, 23],
|
129 |
+
"2.3 A2 Unit 3": [24, 30],
|
130 |
+
"2.4 A2 Unit 4": [31, 35]
|
131 |
+
}}
|
132 |
+
or
|
133 |
+
2.1 AS units 6
|
134 |
+
2.2 AS units 23
|
135 |
+
In that scenario, your output might look like:
|
136 |
+
{{
|
137 |
+
"2.1 AS Unit 1": [6, 22],
|
138 |
+
"2.2 AS Unit 2": [23, 43]
|
139 |
+
}}
|
140 |
+
|
141 |
+
4. Another example might list subtopics:
|
142 |
+
3.1 Overarching themes 11
|
143 |
+
3.2 A: Proof 12
|
144 |
+
3.3 B: Algebra and functions 13
|
145 |
+
3.4 C: Coordinate geometry in the ( x , y ) plane 14
|
146 |
+
3.5 D: Sequences and series 15
|
147 |
+
3.6 E: Trigonometry 16
|
148 |
+
3.7 F: Exponentials and logarithms 17
|
149 |
+
3.8 G: Differentiation 18
|
150 |
+
3.9 H: Integration 19
|
151 |
+
3.10 I: Numerical methods 20
|
152 |
+
3.11 J: Vectors 20
|
153 |
+
3.12 K: Statistical sampling 21
|
154 |
+
3.13 L: Data presentation and interpretation 21
|
155 |
+
3.14 M: Probability 22
|
156 |
+
3.15 N: Statistical distributions 23
|
157 |
+
3.16 O: Statistical hypothesis testing 23
|
158 |
+
3.17 P: Quantities and units in mechanics 24
|
159 |
+
3.18 Q: Kinematics 24
|
160 |
+
3.19 R: Forces and Newton’s laws 24
|
161 |
+
3.20 S: Moments 25
|
162 |
+
3.21 Use of data in statistics 26
|
163 |
+
|
164 |
+
Here the correct output might look like:
|
165 |
+
{{
|
166 |
+
"A: Proof": [12, 12],
|
167 |
+
"B: Algebra and functions": [13, 13],
|
168 |
+
...
|
169 |
+
}}
|
170 |
Now, extract topics from this text:
|
171 |
{first_pages_text}
|
172 |
"""
|
|
|
235 |
total_pages = doc.page_count
|
236 |
doc.close()
|
237 |
|
|
|
238 |
if not subtopics:
|
239 |
+
return {"page_range": list(range(total_pages))}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
|
241 |
+
offset_candidates = []
|
242 |
+
subtopics_corrected = {}
|
243 |
+
for subname, rng in subtopics.items():
|
244 |
+
if not (isinstance(rng, list) and len(rng) == 2):
|
245 |
+
continue
|
246 |
+
start_p, end_p = rng
|
247 |
+
occs = find_all_occurrences(pdf_bytes, subname)
|
248 |
+
for p in occs:
|
249 |
+
candidate = p - (start_p - 1)
|
250 |
+
if candidate > 0:
|
251 |
+
offset_candidates.append(candidate)
|
252 |
+
|
253 |
+
subtopics_corrected[subname] = rng
|
254 |
+
|
255 |
+
if offset_candidates:
|
256 |
+
try:
|
257 |
+
global_offset = mode(offset_candidates)
|
258 |
+
except Exception:
|
259 |
+
global_offset = int(median(offset_candidates))
|
260 |
+
else:
|
261 |
+
global_offset = 0
|
262 |
+
logger.info(f"Computed global offset: {global_offset}")
|
263 |
|
264 |
+
adjusted_subtopics = []
|
|
|
265 |
for subname, rng in subtopics_corrected.items():
|
266 |
start_p, end_p = rng
|
267 |
+
s0 = (start_p) + global_offset
|
268 |
e0 = (end_p - 1) + global_offset
|
269 |
+
adjusted_subtopics.append((subname, (s0, e0)))
|
270 |
|
271 |
+
sorted_subtopics = sorted(adjusted_subtopics, key=lambda x: x[1][0])
|
272 |
+
final_subtopics = []
|
273 |
+
for i in range(len(sorted_subtopics)):
|
274 |
+
subname, (s0, e0) = sorted_subtopics[i]
|
275 |
+
if i < len(sorted_subtopics) - 1:
|
276 |
+
next_s0 = sorted_subtopics[i + 1][1][0]
|
277 |
+
new_e0 = min(e0, next_s0 - 1)
|
|
|
278 |
else:
|
279 |
+
new_e0 = min(e0, total_pages - 1)
|
280 |
+
final_subtopics.append((subname, (s0, new_e0)))
|
281 |
|
|
|
|
|
282 |
real_pages_set = set()
|
283 |
+
for subname, (s0, e0) in final_subtopics:
|
284 |
+
for pp in range(s0, e0 + 1):
|
285 |
+
if 0 <= pp < total_pages:
|
286 |
+
real_pages_set.add(pp)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
287 |
|
288 |
+
page_range = sorted(real_pages_set)
|
289 |
+
logger.info(f"Final page range: {page_range}")
|
290 |
+
return {"page_range": page_range}
|
291 |
|
292 |
if __name__ == "__main__":
|
293 |
+
input_pdf = "/home/user/app/input_output/pearson-A_Level_Economics.pdf"
|
294 |
gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
|
295 |
try:
|
296 |
extractor = TopicRangeExtractor(gemini_api_key=gemini_key)
|
297 |
result = extractor.process(input_pdf)
|
298 |
+
# print(json.dumps(result, indent=2))
|
299 |
except Exception as e:
|
300 |
logger.error(f"Processing failed: {e}")
|
topic_extr.py
CHANGED
@@ -184,10 +184,7 @@ def main():
|
|
184 |
"key": "sample_spec",
|
185 |
"url": "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf",
|
186 |
"type": "specification",
|
187 |
-
"page":
|
188 |
-
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
|
189 |
-
28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41
|
190 |
-
]
|
191 |
}
|
192 |
],
|
193 |
"topics": [
|
|
|
184 |
"key": "sample_spec",
|
185 |
"url": "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf",
|
186 |
"type": "specification",
|
187 |
+
"page": [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42]
|
|
|
|
|
|
|
188 |
}
|
189 |
],
|
190 |
"topics": [
|