SkyNait committed on
Commit
5e4a36e
·
1 Parent(s): 9351a05

correct page array handling

Browse files
Files changed (3) hide show
  1. output/sample_spec_output.md +63 -0
  2. page_range.py +100 -58
  3. topic_extr.py +1 -4
output/sample_spec_output.md ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Paper 1 and Paper 2: Pure Mathematics
2
+
3
+ To support the co-teaching of this qualification with the AS Mathematics qualification, common content has been highlighted in bold.
4
+
5
+ ![](/topic-extraction/8ad59648516f3e9564f0e5df8114f87cd48c2fe5f34b15c28c704962b31adc70.jpg)
6
+
7
+ ![](/topic-extraction/8116200eb839fa0c6d87bb6e96db29559283cc3d7de7ff3834326012ca2d37e3.jpg)
8
+
9
+ ![](/topic-extraction/4cc2bdaa64251411d29493fafb406ad9974260459b247be498e312e29b969a15.jpg)
10
+
11
+ ![](/topic-extraction/a06c5ac3695ab4caff0dc2724c9a8a288fefc94cb1b79e370975be31d3869230.jpg)
12
+
13
+ ![](/topic-extraction/c5631064f99712df9f9591a603ae00098be039f661264ce67925a18c90e06142.jpg)
14
+
15
+ ![](/topic-extraction/88fdba19faed0f761e041fbf040b8cfc57c73bdf36fcd6e32f59f09ac91aeab6.jpg)
16
+
17
+ ![](/topic-extraction/a9a669c1c64b92583f2cc72a8216854bb76c90586a0615afdaee9f0d26d120e9.jpg)
18
+
19
+ ![](/topic-extraction/6b5c9d3211ba2d7c95de68ed81f03fc32f1aba22d55e8ba53fb4586fdb270426.jpg)
20
+
21
+ ![](/topic-extraction/310a9b6f2764de2b165de3343fea3e64ddbd36f8d43c5962dd48730a9e729019.jpg)
22
+
23
+ ![](/topic-extraction/4d3fa5997973de85edbf15b31c91f5d1822c5d9698cbb953d1cd9fff04fca369.jpg)
24
+
25
+ ![](/topic-extraction/480483c021c62d2499f240729e15a6aae16aa6d3be9aec2c65a16e6dd6b878e5.jpg)
26
+
27
+ ![](/topic-extraction/4b2d26dfff554e5c0e2e33968ea3fbae882e9deec5aa2607288ac72f05fbc093.jpg)
28
+
29
+ ![](/topic-extraction/de35db590f61b05cf88744cd89789d664a6abb48c94ef6fb2f380404e0b6aa56.jpg)
30
+
31
+ ![](/topic-extraction/230e72098ba7930d8338b8c0bc7c184e7129ec59141952e7c57f127655a00164.jpg)
32
+
33
+ ![](/topic-extraction/0247eaaab6c95cbc124fa87c44936e2d9963699fd3bc7522596f997029426354.jpg)
34
+
35
+ ![](/topic-extraction/e95d6f913ef911a562b5c5c0e336cf6265c90753738fbf1fb5b86a0370573286.jpg)
36
+
37
+ ![](/topic-extraction/e75eb0c3ddebeb5cdef32f0a4281f98c0f435792630fd5cf2a60827fed6496ae.jpg)
38
+
39
+ ![](/topic-extraction/6db71c2167c71b32503e4025534a9111558ee893c9b94335f73e8d965bdb3e7f.jpg)
40
+
41
+ # Paper 3: Statistics and Mechanics
42
+
43
+ All the Pure Mathematics content is assumed knowledge for Paper 3 and may be tested in parts of questions.
44
+
45
+ To support the co-teaching of this qualification with the AS Mathematics qualification, common content has been highlighted in bold.
46
+
47
+ ![](/topic-extraction/bdc8dba766b71c8baa1fa78425fa9b05960de72fa2e3cd58acec0ed9f6a38484.jpg)
48
+
49
+ ![](/topic-extraction/8a7e0f0815ec510978f1e4629f452be0f698ae3b2b73fdd0c6cb6d01b73c658d.jpg)
50
+
51
+ ![](/topic-extraction/c0f6c78a4393655d252cf16cf91690f5b853c925eef73e15ce9473f6039518e8.jpg)
52
+
53
+ ![](/topic-extraction/d8fc74d90978852def7740a09c94949a8b30a37248555561f4997f4d40bad7b1.jpg)
54
+
55
+ ![](/topic-extraction/c27edd49d1ff81e5e31321b53fc559bac988181af672cda7fe65fb17e48fd674.jpg)
56
+
57
+ ![](/topic-extraction/f82f21d337bc60d0dc797db76b5738144904989fb044160d9fcceaa41651aa33.jpg)
58
+
59
+ ![](/topic-extraction/74059e4d980d876dec0451f14e791402349da955dda7308450dccc287bed0147.jpg)
60
+
61
+ ![](/topic-extraction/263c0b8a692bad208c16544fd15d1b12c10dae66e88f3067e4c34932af7eebc4.jpg)
62
+
63
+ ![](/topic-extraction/80919764b501319dc4a0fd6715bd31192ad14c7090ed0aed89eabef833b7622e.jpg)
page_range.py CHANGED
@@ -5,7 +5,9 @@ import json
5
  import logging
6
  import fitz
7
  import requests
 
8
  from statistics import mode, median
 
9
 
10
  from google import genai
11
  from google.genai import types
@@ -62,7 +64,7 @@ You have the first pages of a PDF specification, including a table of contents.
62
  Instructions:
63
  1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
64
  2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
65
- 3. For each subtopic, give the range of pages [start_page, end_page -1] (1-based) from the table of contents.
66
  4. Output only valid JSON of the form:
67
  {{
68
  "Subtopic A": [start_page, end_page],
@@ -72,6 +74,7 @@ Instructions:
72
  Important notes:
73
  - The correct "end_page" must be the page number of the next topic or subtopic minus 1.
74
  - The final output must be valid JSON only, with no extra text or code blocks.
 
75
  Examples:
76
  1. Given this table of contents:
77
  1 Introduction – 2
@@ -104,6 +107,7 @@ Assessment – 39
104
  Assessment summary - 39
105
  Assessment objectives - 41
106
  Assessment overview - 42
 
107
 
108
  The correct output should be:
109
  {{
@@ -112,6 +116,57 @@ The correct output should be:
112
  "Theme 3: Business behaviour and the labour market": [21, 28],
113
  "Theme 4: A global perspective": [29, 38]
114
  }}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  Now, extract topics from this text:
116
  {first_pages_text}
117
  """
@@ -180,79 +235,66 @@ class TopicRangeExtractor:
180
  total_pages = doc.page_count
181
  doc.close()
182
 
183
- # Compute global offset and adjust subtopic ranges.
184
  if not subtopics:
185
- global_offset = 0
186
- subtopics_corrected = {}
187
- else:
188
- offset_candidates = []
189
- subtopics_corrected = {}
190
- for subname, rng in subtopics.items():
191
- if not (isinstance(rng, list) and len(rng) == 2):
192
- continue
193
- start_p, end_p = rng
194
- occs = find_all_occurrences(pdf_bytes, subname)
195
- for p in occs:
196
- candidate = p - (start_p - 1)
197
- if candidate > 0:
198
- offset_candidates.append(candidate)
199
- subtopics_corrected[subname] = rng
200
 
201
- if offset_candidates:
202
- try:
203
- global_offset = mode(offset_candidates)
204
- except Exception:
205
- global_offset = int(median(offset_candidates))
206
- else:
207
- global_offset = 0
208
- logger.info(f"Computed global offset: {global_offset}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
- # Adjust ranges by applying the global offset.
211
- adjusted_topics = {}
212
  for subname, rng in subtopics_corrected.items():
213
  start_p, end_p = rng
214
- s0 = (start_p - 1) + global_offset
215
  e0 = (end_p - 1) + global_offset
216
- adjusted_topics[subname] = [s0, e0]
217
 
218
- # Sort the topics by their adjusted start page.
219
- sorted_topics = sorted(adjusted_topics.items(), key=lambda item: item[1][0])
220
- effective_ranges = {}
221
- # For each subtopic, if there is a next one, set its effective end to the next topic's start minus 1.
222
- for i, (name, (start, end)) in enumerate(sorted_topics):
223
- if i < len(sorted_topics) - 1:
224
- next_start = sorted_topics[i+1][1][0]
225
- effective_end = min(end, next_start - 1)
226
  else:
227
- effective_end = end
228
- effective_ranges[name] = [start, effective_end]
229
 
230
- # Build the union of pages from each effective range.
231
- # For every topic except the last, use a half-open range to skip the boundary page.
232
  real_pages_set = set()
233
- for i, (name, (start, end)) in enumerate(sorted_topics):
234
- if i < len(sorted_topics) - 1:
235
- # End is exclusive so the boundary page (end) is skipped.
236
- for pp in range(start, end):
237
- if 0 <= pp < total_pages:
238
- real_pages_set.add(pp)
239
- else:
240
- # For the last topic include the end page.
241
- for pp in range(start, end + 1):
242
- if 0 <= pp < total_pages:
243
- real_pages_set.add(pp)
244
- page_range = sorted(real_pages_set)
245
 
246
- return {
247
- "page_range": page_range
248
- }
249
 
250
  if __name__ == "__main__":
251
- input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
252
  gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
253
  try:
254
  extractor = TopicRangeExtractor(gemini_api_key=gemini_key)
255
  result = extractor.process(input_pdf)
256
- print(json.dumps(result, indent=2))
257
  except Exception as e:
258
  logger.error(f"Processing failed: {e}")
 
5
  import logging
6
  import fitz
7
  import requests
8
+ import time
9
  from statistics import mode, median
10
+ from typing import Dict, List, Tuple
11
 
12
  from google import genai
13
  from google.genai import types
 
64
  Instructions:
65
  1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
66
  2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
67
+ 3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
68
  4. Output only valid JSON of the form:
69
  {{
70
  "Subtopic A": [start_page, end_page],
 
74
  Important notes:
75
  - The correct "end_page" must be the page number of the next topic or subtopic minus 1.
76
  - The final output must be valid JSON only, with no extra text or code blocks.
77
+
78
  Examples:
79
  1. Given this table of contents:
80
  1 Introduction – 2
 
107
  Assessment summary - 39
108
  Assessment objectives - 41
109
  Assessment overview - 42
110
+ Breakdown of assessment objectives - 42
111
 
112
  The correct output should be:
113
  {{
 
116
  "Theme 3: Business behaviour and the labour market": [21, 28],
117
  "Theme 4: A global perspective": [29, 38]
118
  }}
119
+
120
+ 3. You might also see sections like:
121
+ 2.1 AS Unit 1 11
122
+ 2.2 AS Unit 2 18
123
+ 2.3 A2 Unit 3 24
124
+ 2.4 A2 Unit 4 31
125
+ In that scenario, your output might look like:
126
+ {{
127
+ "2.1 AS Unit 1": [11, 17],
128
+ "2.2 AS Unit 2": [18, 23],
129
+ "2.3 A2 Unit 3": [24, 30],
130
+ "2.4 A2 Unit 4": [31, 35]
131
+ }}
132
+ or
133
+ 2.1 AS units 6
134
+ 2.2 AS units 23
135
+ In that scenario, your output might look like:
136
+ {{
137
+ "2.1 AS Unit 1": [6, 22],
138
+ "2.2 AS Unit 2": [23, 43]
139
+ }}
140
+
141
+ 4. Another example might list subtopics:
142
+ 3.1 Overarching themes 11
143
+ 3.2 A: Proof 12
144
+ 3.3 B: Algebra and functions 13
145
+ 3.4 C: Coordinate geometry in the ( x , y ) plane 14
146
+ 3.5 D: Sequences and series 15
147
+ 3.6 E: Trigonometry 16
148
+ 3.7 F: Exponentials and logarithms 17
149
+ 3.8 G: Differentiation 18
150
+ 3.9 H: Integration 19
151
+ 3.10 I: Numerical methods 20
152
+ 3.11 J: Vectors 20
153
+ 3.12 K: Statistical sampling 21
154
+ 3.13 L: Data presentation and interpretation 21
155
+ 3.14 M: Probability 22
156
+ 3.15 N: Statistical distributions 23
157
+ 3.16 O: Statistical hypothesis testing 23
158
+ 3.17 P: Quantities and units in mechanics 24
159
+ 3.18 Q: Kinematics 24
160
+ 3.19 R: Forces and Newton’s laws 24
161
+ 3.20 S: Moments 25
162
+ 3.21 Use of data in statistics 26
163
+
164
+ Here the correct output might look like:
165
+ {{
166
+ "A: Proof": [12, 12],
167
+ "B: Algebra and functions": [13, 13],
168
+ ...
169
+ }}
170
  Now, extract topics from this text:
171
  {first_pages_text}
172
  """
 
235
  total_pages = doc.page_count
236
  doc.close()
237
 
 
238
  if not subtopics:
239
+ return {"page_range": list(range(total_pages))}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
+ offset_candidates = []
242
+ subtopics_corrected = {}
243
+ for subname, rng in subtopics.items():
244
+ if not (isinstance(rng, list) and len(rng) == 2):
245
+ continue
246
+ start_p, end_p = rng
247
+ occs = find_all_occurrences(pdf_bytes, subname)
248
+ for p in occs:
249
+ candidate = p - (start_p - 1)
250
+ if candidate > 0:
251
+ offset_candidates.append(candidate)
252
+
253
+ subtopics_corrected[subname] = rng
254
+
255
+ if offset_candidates:
256
+ try:
257
+ global_offset = mode(offset_candidates)
258
+ except Exception:
259
+ global_offset = int(median(offset_candidates))
260
+ else:
261
+ global_offset = 0
262
+ logger.info(f"Computed global offset: {global_offset}")
263
 
264
+ adjusted_subtopics = []
 
265
  for subname, rng in subtopics_corrected.items():
266
  start_p, end_p = rng
267
+ s0 = (start_p) + global_offset
268
  e0 = (end_p - 1) + global_offset
269
+ adjusted_subtopics.append((subname, (s0, e0)))
270
 
271
+ sorted_subtopics = sorted(adjusted_subtopics, key=lambda x: x[1][0])
272
+ final_subtopics = []
273
+ for i in range(len(sorted_subtopics)):
274
+ subname, (s0, e0) = sorted_subtopics[i]
275
+ if i < len(sorted_subtopics) - 1:
276
+ next_s0 = sorted_subtopics[i + 1][1][0]
277
+ new_e0 = min(e0, next_s0 - 1)
 
278
  else:
279
+ new_e0 = min(e0, total_pages - 1)
280
+ final_subtopics.append((subname, (s0, new_e0)))
281
 
 
 
282
  real_pages_set = set()
283
+ for subname, (s0, e0) in final_subtopics:
284
+ for pp in range(s0, e0 + 1):
285
+ if 0 <= pp < total_pages:
286
+ real_pages_set.add(pp)
 
 
 
 
 
 
 
 
287
 
288
+ page_range = sorted(real_pages_set)
289
+ logger.info(f"Final page range: {page_range}")
290
+ return {"page_range": page_range}
291
 
292
  if __name__ == "__main__":
293
+ input_pdf = "/home/user/app/input_output/pearson-A_Level_Economics.pdf"
294
  gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
295
  try:
296
  extractor = TopicRangeExtractor(gemini_api_key=gemini_key)
297
  result = extractor.process(input_pdf)
298
+ # print(json.dumps(result, indent=2))
299
  except Exception as e:
300
  logger.error(f"Processing failed: {e}")
topic_extr.py CHANGED
@@ -184,10 +184,7 @@ def main():
184
  "key": "sample_spec",
185
  "url": "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf",
186
  "type": "specification",
187
- "page": [
188
- 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
189
- 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41
190
- ]
191
  }
192
  ],
193
  "topics": [
 
184
  "key": "sample_spec",
185
  "url": "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf",
186
  "type": "specification",
187
+ "page": [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42]
 
 
 
188
  }
189
  ],
190
  "topics": [