SkyNait committed on
Commit
2c394b4
·
1 Parent(s): 145c342

correct JSON and filtering

Browse files
input_output/wjec-gce-as-a-economics-specification-from-2015.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eef6fa3102f82c9a3e0eb99a8c7a08f86df01c2ba7636ff4bef8cbd7f780e7b6
3
+ size 3543551
pearson_json/_subtopics.json ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "title": "Content",
4
+ "contents": [
5
+ {
6
+ "type": "image",
7
+ "key": "/topic-extraction/cells/img_1.jpg_r0_c0.png"
8
+ },
9
+ {
10
+ "type": "image",
11
+ "key": "/topic-extraction/cells/img_3.jpg_r0_c0.png"
12
+ },
13
+ {
14
+ "type": "image",
15
+ "key": "/topic-extraction/cells/img_4.jpg_r1_c0.png"
16
+ },
17
+ {
18
+ "type": "image",
19
+ "key": "/topic-extraction/cells/img_5.jpg_r0_c0.png"
20
+ },
21
+ {
22
+ "type": "image",
23
+ "key": "/topic-extraction/cells/img_6.jpg_r0_c0.png"
24
+ },
25
+ {
26
+ "type": "image",
27
+ "key": "/topic-extraction/cells/img_9.jpg_r0_c0.png"
28
+ },
29
+ {
30
+ "type": "image",
31
+ "key": "/topic-extraction/cells/img_15.jpg_r0_c0.png"
32
+ },
33
+ {
34
+ "type": "image",
35
+ "key": "/topic-extraction/cells/img_16.jpg_r0_c0.png"
36
+ },
37
+ {
38
+ "type": "image",
39
+ "key": "/topic-extraction/cells/img_18.jpg_r0_c0.png"
40
+ },
41
+ {
42
+ "type": "image",
43
+ "key": "/topic-extraction/cells/img_19.jpg_r0_c0.png"
44
+ },
45
+ {
46
+ "type": "image",
47
+ "key": "/topic-extraction/cells/img_20.jpg_r0_c0.png"
48
+ },
49
+ {
50
+ "type": "image",
51
+ "key": "/topic-extraction/cells/img_22.jpg_r0_c0.png"
52
+ },
53
+ {
54
+ "type": "image",
55
+ "key": "/topic-extraction/cells/img_23.jpg_r0_c0.png"
56
+ },
57
+ {
58
+ "type": "image",
59
+ "key": "/topic-extraction/cells/img_27.jpg_r0_c0.png"
60
+ },
61
+ {
62
+ "type": "image",
63
+ "key": "/topic-extraction/cells/img_28.jpg_r0_c0.png"
64
+ },
65
+ {
66
+ "type": "image",
67
+ "key": "/topic-extraction/cells/img_29.jpg_r0_c0.png"
68
+ }
69
+ ],
70
+ "children": []
71
+ },
72
+ {
73
+ "title": "Factors influencing demand and supply in product markets",
74
+ "contents": [
75
+ {
76
+ "type": "image",
77
+ "key": "/topic-extraction/cells/img_2.jpg_r1_c0.png"
78
+ }
79
+ ],
80
+ "children": []
81
+ },
82
+ {
83
+ "title": "Why and how governments intervene in markets",
84
+ "contents": [
85
+ {
86
+ "type": "image",
87
+ "key": "/topic-extraction/cells/img_7.jpg_r1_c0.png"
88
+ }
89
+ ],
90
+ "children": []
91
+ },
92
+ {
93
+ "title": "The circular flow of income model",
94
+ "contents": [
95
+ {
96
+ "type": "image",
97
+ "key": "/topic-extraction/cells/img_8.jpg_r2_c0.png"
98
+ }
99
+ ],
100
+ "children": []
101
+ },
102
+ {
103
+ "title": "Government policy objectives",
104
+ "contents": [
105
+ {
106
+ "type": "image",
107
+ "key": "/topic-extraction/cells/img_10.jpg_r1_c0.png"
108
+ }
109
+ ],
110
+ "children": []
111
+ },
112
+ {
113
+ "title": "Fiscal policy",
114
+ "contents": [
115
+ {
116
+ "type": "image",
117
+ "key": "/topic-extraction/cells/img_11.jpg_r1_c0.png"
118
+ }
119
+ ],
120
+ "children": []
121
+ },
122
+ {
123
+ "title": "Monetary policy",
124
+ "contents": [
125
+ {
126
+ "type": "image",
127
+ "key": "/topic-extraction/cells/img_12.jpg_r1_c0.png"
128
+ }
129
+ ],
130
+ "children": []
131
+ },
132
+ {
133
+ "title": "Exchange rates and exchange rate policy",
134
+ "contents": [
135
+ {
136
+ "type": "image",
137
+ "key": "/topic-extraction/cells/img_13.jpg_r1_c0.png"
138
+ }
139
+ ],
140
+ "children": []
141
+ },
142
+ {
143
+ "title": "Free trade and protectionism",
144
+ "contents": [
145
+ {
146
+ "type": "image",
147
+ "key": "/topic-extraction/cells/img_14.jpg_r1_c0.png"
148
+ }
149
+ ],
150
+ "children": []
151
+ },
152
+ {
153
+ "title": "Monopoly",
154
+ "contents": [
155
+ {
156
+ "type": "image",
157
+ "key": "/topic-extraction/cells/img_17.jpg_r2_c0.png"
158
+ }
159
+ ],
160
+ "children": []
161
+ },
162
+ {
163
+ "title": "Economic growth",
164
+ "contents": [
165
+ {
166
+ "type": "image",
167
+ "key": "/topic-extraction/cells/img_21.jpg_r1_c0.png"
168
+ }
169
+ ],
170
+ "children": []
171
+ },
172
+ {
173
+ "title": "Inflation and deflation",
174
+ "contents": [
175
+ {
176
+ "type": "image",
177
+ "key": "/topic-extraction/cells/img_24.jpg_r1_c0.png"
178
+ }
179
+ ],
180
+ "children": []
181
+ },
182
+ {
183
+ "title": "The balance of payments",
184
+ "contents": [
185
+ {
186
+ "type": "image",
187
+ "key": "/topic-extraction/cells/img_25.jpg_r2_c0.png"
188
+ }
189
+ ],
190
+ "children": []
191
+ },
192
+ {
193
+ "title": "Control of the national (public sector) debt",
194
+ "contents": [
195
+ {
196
+ "type": "image",
197
+ "key": "/topic-extraction/cells/img_26.jpg_r1_c0.png"
198
+ }
199
+ ],
200
+ "children": []
201
+ }
202
+ ]
topic_extr.py CHANGED
@@ -207,6 +207,13 @@ class s3Writer:
207
  logger.error(f"Failed to upload to S3: {str(e)}")
208
  raise
209
 
 
 
 
 
 
 
 
210
  def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
211
  arr = np.frombuffer(image_data, np.uint8)
212
  img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
@@ -238,11 +245,6 @@ The two-column 'table' image includes such key features:
238
  - Two columns
239
  - Headers like 'Subject content', 'Additional information'
240
  - Possibly sections (e.g. 2.1, 3.4, G2, G3, )
241
- The empty image include such key features:
242
- - Does not include anything at all (like a blank white or black image)
243
- - Truncated image with words like 'Subject content', 'What students need to learn' with blue background.
244
- - Truncated image with words like 'Topics', 'What students need to learn', 'Content' with grey background ((166, 166, 166) or (180,180,180) RGB color code).
245
- If the image is an empty image, respond with 'EMPTY_IMAGE'.
246
  If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
247
  If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
248
  If the image is non-empty but does not show a table, respond with 'NO_TABLE'.
@@ -309,7 +311,7 @@ Your task is to extract:
309
 
310
  Follow these rules:
311
 
312
- (1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued", then:
313
  - Put that text (without the word "continued") in "title". (e.g. "2 Algebra and functions")
314
  - "subtopics" should be an empty array, unless you also see smaller subtopic numbers.
315
 
@@ -318,11 +320,11 @@ Follow these rules:
318
  - "title" in this case should be an empty string if you only detect subtopics.
319
  (Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).
320
 
321
- (3) **If neither a main topic nor a subtopic is detected,** return empty values:
322
- {
323
  "title": "",
324
  "subtopics": []
325
- }
326
 
327
  (4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
328
  - Use the **left column text** as "title".
@@ -344,6 +346,15 @@ Follow these rules:
344
  "subtopics": [...]
345
  }
346
 
 
 
 
 
 
 
 
 
 
347
  **Examples**:
348
 
349
  - If the image text is `"2 Algebra and functions continued"`, return:
@@ -411,6 +422,8 @@ Follow these rules:
411
 
412
  title = data.get("title", "")
413
  subtopics = data.get("subtopics", [])
 
 
414
  if not isinstance(subtopics, list):
415
  subtopics = []
416
  return {"title": title, "subtopics": subtopics}
@@ -454,21 +467,27 @@ class S3ImageWriter(DataWriter):
454
  for p, info in self.descriptions.items()
455
  }
456
  results = await asyncio.gather(*tasks.values(), return_exceptions=True)
457
- for p, result in zip(tasks.keys(), results):
458
  if isinstance(result, Exception):
459
  logger.error(f"Table classification error for {p}: {result}")
460
  self.descriptions[p]['table_classification'] = "NO_TABLE"
461
  else:
462
  self.descriptions[p]['table_classification'] = result
463
 
464
- for p, info in self.descriptions.items():
 
465
  cls = info['table_classification']
466
  if cls == "TWO_COLUMN":
467
  info['final_alt'] = "HAS TO BE PROCESSED - two column table"
468
  elif cls == "THREE_COLUMN":
469
  info['final_alt'] = "HAS TO BE PROCESSED - three column table"
470
  elif cls == "EMPTY_IMAGE":
 
471
  md_content = md_content.replace(f"![]({key}{p})", "")
 
 
 
 
472
  del self.descriptions[p]
473
  continue
474
  else:
@@ -477,7 +496,7 @@ class S3ImageWriter(DataWriter):
477
 
478
  md_content = await self._process_table_images_in_markdown(key, md_content)
479
 
480
- # Filter final lines to keep only lines with images
481
  final_lines = [
482
  line.strip() for line in md_content.split("\n")
483
  if re.match(r"^\!\[.*\]\(.*\)", line.strip())
@@ -558,12 +577,20 @@ class S3ImageWriter(DataWriter):
558
  with open(cell_path, "rb") as cf:
559
  cell_image_data = cf.read()
560
 
561
- # Save cell image to S3.
562
  cell_key = f"{self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.png"
563
  self.s3_writer.write(cell_key, cell_image_data)
564
-
 
565
  info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)
566
- # logger.info(f"Gemini subtopic extraction result for cell {cell_path}: {info}")
 
 
 
 
 
 
 
 
567
 
568
  if info["title"] and not recognized_main_topic:
569
  recognized_main_topic = info["title"]
@@ -706,6 +733,15 @@ In that scenario, your output might look like:
706
  "2.3 A2 Unit 3": [24, 30],
707
  "2.4 A2 Unit 4": [31, 35]
708
  }}
 
 
 
 
 
 
 
 
 
709
  4. Another example might list subtopics:
710
  3.1 Overarching themes 11
711
  3.2 A: Proof 12
@@ -912,7 +948,7 @@ class MineruNoTextProcessor:
912
  subtopic_list = list(writer.extracted_subtopics.values())
913
  subtopic_list = merge_topics(subtopic_list)
914
 
915
- out_path = os.path.join(self.output_folder, "subtopics.json")
916
  with open(out_path, "w", encoding="utf-8") as f:
917
  json.dump(subtopic_list, f, indent=2)
918
  logger.info(f"Final subtopics JSON saved locally at {out_path}")
@@ -925,7 +961,7 @@ class MineruNoTextProcessor:
925
  self.cleanup_gpu()
926
 
927
  if __name__ == "__main__":
928
- input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
929
  output_dir = "/home/user/app/pearson_json"
930
  gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
931
  try:
 
207
  logger.error(f"Failed to upload to S3: {str(e)}")
208
  raise
209
 
210
+ def delete(self, path: str) -> None:
211
+ try:
212
+ self.client.delete_object(Bucket=self.bucket, Key=path)
213
+ except Exception as e:
214
+ logger.error(f"Failed to delete from S3: {str(e)}")
215
+ raise
216
+
217
  def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
218
  arr = np.frombuffer(image_data, np.uint8)
219
  img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
 
245
  - Two columns
246
  - Headers like 'Subject content', 'Additional information'
247
  - Possibly sections (e.g. 2.1, 3.4, G2, G3, )
 
 
 
 
 
248
  If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
249
  If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
250
  If the image is non-empty but does not show a table, respond with 'NO_TABLE'.
 
311
 
312
  Follow these rules:
313
 
314
+ (1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued", (remove the word "continued") then:
315
  - Put that text (without the word "continued") in "title". (e.g. "2 Algebra and functions")
316
  - "subtopics" should be an empty array, unless you also see smaller subtopic numbers.
317
 
 
320
  - "title" in this case should be an empty string if you only detect subtopics.
321
  (Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).
322
 
323
+ (3) If no main topic or subtopic is detected but the text appears to be a heading (e.g. "Scarcity, choice and opportunity cost"), return:
324
+ {{
325
  "title": "",
326
  "subtopics": []
327
+ }}
328
 
329
  (4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
330
  - Use the **left column text** as "title".
 
346
  "subtopics": [...]
347
  }
348
 
349
+ (7) If the image is blank or truncated, defined as:
350
+ - Contains no words at all (e.g. a blank white or black image)
351
+ - Contains only a truncated snippet of words such as "Topics", "What students need to learn" with blue background
352
+ - Contains a truncated snippet with words like "Topics", "What students need to learn", "Content" with gray background (RGB (166,166,166) or (180,180,180)) then return:
353
+ {{
354
+ "title": "EMPTY_IMAGE",
355
+ "subtopics": []
356
+ }}
357
+
358
  **Examples**:
359
 
360
  - If the image text is `"2 Algebra and functions continued"`, return:
 
422
 
423
  title = data.get("title", "")
424
  subtopics = data.get("subtopics", [])
425
+ if title.upper() == "EMPTY_IMAGE":
426
+ return {"title": "EMPTY_IMAGE", "subtopics": []}
427
  if not isinstance(subtopics, list):
428
  subtopics = []
429
  return {"title": title, "subtopics": subtopics}
 
467
  for p, info in self.descriptions.items()
468
  }
469
  results = await asyncio.gather(*tasks.values(), return_exceptions=True)
470
+ for p, result in zip(list(self.descriptions.keys()), results):
471
  if isinstance(result, Exception):
472
  logger.error(f"Table classification error for {p}: {result}")
473
  self.descriptions[p]['table_classification'] = "NO_TABLE"
474
  else:
475
  self.descriptions[p]['table_classification'] = result
476
 
477
+ # Process each image description.
478
+ for p, info in list(self.descriptions.items()):
479
  cls = info['table_classification']
480
  if cls == "TWO_COLUMN":
481
  info['final_alt'] = "HAS TO BE PROCESSED - two column table"
482
  elif cls == "THREE_COLUMN":
483
  info['final_alt'] = "HAS TO BE PROCESSED - three column table"
484
  elif cls == "EMPTY_IMAGE":
485
+ # Remove markdown reference, delete from descriptions and S3.
486
  md_content = md_content.replace(f"![]({key}{p})", "")
487
+ try:
488
+ self.s3_writer.delete(info['s3_path'])
489
+ except Exception as e:
490
+ logger.error(f"Error deleting S3 object {info['s3_path']}: {e}")
491
  del self.descriptions[p]
492
  continue
493
  else:
 
496
 
497
  md_content = await self._process_table_images_in_markdown(key, md_content)
498
 
499
+ # Filter final lines to keep only lines with images.
500
  final_lines = [
501
  line.strip() for line in md_content.split("\n")
502
  if re.match(r"^\!\[.*\]\(.*\)", line.strip())
 
577
  with open(cell_path, "rb") as cf:
578
  cell_image_data = cf.read()
579
 
 
580
  cell_key = f"{self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.png"
581
  self.s3_writer.write(cell_key, cell_image_data)
582
+
583
+ # Extract subtopic info from the cell image.
584
  info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)
585
+
586
+ # Check if the image is empty.
587
+ if info.get("title", "").upper() == "EMPTY_IMAGE":
588
+ try:
589
+ self.s3_writer.delete(cell_key)
590
+ logger.info(f"Deleted empty cell image from S3: {cell_key}")
591
+ except Exception as e:
592
+ logger.error(f"Error deleting empty cell image {cell_key}: {e}")
593
+ continue # Skip processing this cell further
594
 
595
  if info["title"] and not recognized_main_topic:
596
  recognized_main_topic = info["title"]
 
733
  "2.3 A2 Unit 3": [24, 30],
734
  "2.4 A2 Unit 4": [31, 35]
735
  }}
736
+ or
737
+ 2.1 AS units 6
738
+ 2.2 AS units 23
739
+ In that scenario, your output might look like:
740
+ {{
741
+ "2.1 AS Unit 1": [6, 2],
742
+ "2.2 AS Unit 2": [23, 43]
743
+ }}
744
+
745
  4. Another example might list subtopics:
746
  3.1 Overarching themes 11
747
  3.2 A: Proof 12
 
948
  subtopic_list = list(writer.extracted_subtopics.values())
949
  subtopic_list = merge_topics(subtopic_list)
950
 
951
+ out_path = os.path.join(self.output_folder, "_subtopics.json")
952
  with open(out_path, "w", encoding="utf-8") as f:
953
  json.dump(subtopic_list, f, indent=2)
954
  logger.info(f"Final subtopics JSON saved locally at {out_path}")
 
961
  self.cleanup_gpu()
962
 
963
  if __name__ == "__main__":
964
+ input_pdf = "/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf"
965
  output_dir = "/home/user/app/pearson_json"
966
  gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
967
  try:
topic_extraction.log CHANGED
The diff for this file is too large to render. See raw diff