SkyNait committed on
Commit
f81cfef
·
1 Parent(s): ae8cbf3

enhanced row output filtering

Browse files
__pycache__/inference_svm_model.cpython-310.pyc CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ
 
__pycache__/mineru_single.cpython-310.pyc CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ
 
__pycache__/table_row_extraction.cpython-310.pyc CHANGED
Binary files a/__pycache__/table_row_extraction.cpython-310.pyc and b/__pycache__/table_row_extraction.cpython-310.pyc differ
 
__pycache__/topic_extraction.cpython-310.pyc CHANGED
Binary files a/__pycache__/topic_extraction.cpython-310.pyc and b/__pycache__/topic_extraction.cpython-310.pyc differ
 
__pycache__/worker.cpython-310.pyc CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
 
output1.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b2f32c4f39c66673ac775c4061a57259a92b5fc69e81fec46374a9a0eb492b2
3
- size 123145
 
 
 
 
pearson_json/_subtopics.json CHANGED
@@ -1,122 +1,284 @@
1
  [
2
  {
3
- "title": "Content",
4
  "contents": [
5
  {
6
  "type": "image",
7
- "key": "/topic-extraction/cells/img_1.jpg_r0_c0.png"
8
- },
 
 
 
 
 
 
9
  {
10
  "type": "image",
11
  "key": "/topic-extraction/cells/img_2.jpg_r0_c0.png"
12
  },
13
  {
14
  "type": "image",
15
- "key": "/topic-extraction/cells/img_3.jpg_r0_c0.png"
16
- },
 
 
 
 
 
 
17
  {
18
  "type": "image",
19
- "key": "/topic-extraction/cells/img_4.jpg_r1_c0.png"
20
- },
 
 
 
 
 
 
21
  {
22
  "type": "image",
23
- "key": "/topic-extraction/cells/img_5.jpg_r0_c0.png"
24
- },
 
 
 
 
 
 
25
  {
26
  "type": "image",
27
- "key": "/topic-extraction/cells/img_6.jpg_r0_c0.png"
28
- },
 
 
 
 
 
 
29
  {
30
  "type": "image",
31
- "key": "/topic-extraction/cells/img_7.jpg_r0_c0.png"
32
- },
 
 
 
 
 
 
33
  {
34
  "type": "image",
35
- "key": "/topic-extraction/cells/img_8.jpg_r1_c0.png"
36
- },
 
 
 
 
 
 
37
  {
38
  "type": "image",
39
- "key": "/topic-extraction/cells/img_9.jpg_r0_c0.png"
40
- },
 
 
 
 
 
 
41
  {
42
  "type": "image",
43
- "key": "/topic-extraction/cells/img_10.jpg_r0_c0.png"
44
- },
 
 
 
 
 
 
45
  {
46
  "type": "image",
47
- "key": "/topic-extraction/cells/img_11.jpg_r0_c0.png"
48
- },
 
 
 
 
 
 
49
  {
50
  "type": "image",
51
- "key": "/topic-extraction/cells/img_12.jpg_r0_c0.png"
52
- },
 
 
 
 
 
 
53
  {
54
  "type": "image",
55
- "key": "/topic-extraction/cells/img_13.jpg_r0_c1.png"
56
- },
 
 
 
 
 
 
57
  {
58
  "type": "image",
59
- "key": "/topic-extraction/cells/img_14.jpg_r0_c0.png"
60
- },
 
 
 
 
 
 
61
  {
62
  "type": "image",
63
- "key": "/topic-extraction/cells/img_15.jpg_r0_c0.png"
64
- },
 
 
 
 
 
 
65
  {
66
  "type": "image",
67
- "key": "/topic-extraction/cells/img_16.jpg_r0_c0.png"
68
- },
 
 
 
 
 
 
69
  {
70
  "type": "image",
71
- "key": "/topic-extraction/cells/img_17.jpg_r1_c0.png"
72
- },
 
 
 
 
 
 
73
  {
74
  "type": "image",
75
- "key": "/topic-extraction/cells/img_18.jpg_r0_c0.png"
76
- },
 
 
 
 
 
 
77
  {
78
  "type": "image",
79
- "key": "/topic-extraction/cells/img_19.jpg_r0_c0.png"
80
- },
 
 
 
 
 
 
81
  {
82
  "type": "image",
83
- "key": "/topic-extraction/cells/img_20.jpg_r0_c0.png"
84
- },
 
 
 
 
 
 
85
  {
86
  "type": "image",
87
- "key": "/topic-extraction/cells/img_21.jpg_r0_c0.png"
88
- },
 
 
 
 
 
 
89
  {
90
  "type": "image",
91
- "key": "/topic-extraction/cells/img_22.jpg_r0_c0.png"
92
- },
 
 
 
 
 
 
93
  {
94
  "type": "image",
95
- "key": "/topic-extraction/cells/img_23.jpg_r0_c0.png"
96
- },
 
 
 
 
 
 
97
  {
98
  "type": "image",
99
- "key": "/topic-extraction/cells/img_24.jpg_r0_c0.png"
100
- },
 
 
 
 
 
 
101
  {
102
  "type": "image",
103
- "key": "/topic-extraction/cells/img_25.jpg_r1_c0.png"
104
- },
 
 
 
 
 
 
105
  {
106
  "type": "image",
107
- "key": "/topic-extraction/cells/img_26.jpg_r0_c0.png"
108
- },
 
 
 
 
 
 
109
  {
110
  "type": "image",
111
- "key": "/topic-extraction/cells/img_27.jpg_r0_c0.png"
112
- },
 
 
 
 
 
 
113
  {
114
  "type": "image",
115
- "key": "/topic-extraction/cells/img_28.jpg_r0_c0.png"
116
- },
 
 
 
 
 
 
117
  {
118
  "type": "image",
119
- "key": "/topic-extraction/cells/img_29.jpg_r0_c0.png"
120
  }
121
  ],
122
  "children": []
 
1
  [
2
  {
3
+ "title": "Scarcity, choice and opportunity cost",
4
  "contents": [
5
  {
6
  "type": "image",
7
+ "key": "/topic-extraction/cells/img_1.jpg_r1_c0.png"
8
+ }
9
+ ],
10
+ "children": []
11
+ },
12
+ {
13
+ "title": "Content Amplification Additional guidance notes",
14
+ "contents": [
15
  {
16
  "type": "image",
17
  "key": "/topic-extraction/cells/img_2.jpg_r0_c0.png"
18
  },
19
  {
20
  "type": "image",
21
+ "key": "/topic-extraction/cells/img_18.jpg_r0_c0.png"
22
+ }
23
+ ],
24
+ "children": []
25
+ },
26
+ {
27
+ "title": "Price, income and cross price elasticities of demand, price elasticity of supply",
28
+ "contents": [
29
  {
30
  "type": "image",
31
+ "key": "/topic-extraction/cells/img_3.jpg_r1_c0.png"
32
+ }
33
+ ],
34
+ "children": []
35
+ },
36
+ {
37
+ "title": "Wage determination",
38
+ "contents": [
39
  {
40
  "type": "image",
41
+ "key": "/topic-extraction/cells/img_4.jpg_r2_c0.png"
42
+ }
43
+ ],
44
+ "children": []
45
+ },
46
+ {
47
+ "title": "How resources are allocated in a free market economy",
48
+ "contents": [
49
  {
50
  "type": "image",
51
+ "key": "/topic-extraction/cells/img_5.jpg_r1_c0.png"
52
+ }
53
+ ],
54
+ "children": []
55
+ },
56
+ {
57
+ "title": "Understanding market failure",
58
+ "contents": [
59
  {
60
  "type": "image",
61
+ "key": "/topic-extraction/cells/img_6.jpg_r1_c0.png"
62
+ }
63
+ ],
64
+ "children": []
65
+ },
66
+ {
67
+ "title": "Why and how governments intervene in markets",
68
+ "contents": [
69
  {
70
  "type": "image",
71
+ "key": "/topic-extraction/cells/img_7.jpg_r1_c0.png"
72
+ }
73
+ ],
74
+ "children": []
75
+ },
76
+ {
77
+ "title": "The circular flow of income model",
78
+ "contents": [
79
  {
80
  "type": "image",
81
+ "key": "/topic-extraction/cells/img_8.jpg_r2_c0.png"
82
+ }
83
+ ],
84
+ "children": []
85
+ },
86
+ {
87
+ "title": "The AD function",
88
+ "contents": [
89
  {
90
  "type": "image",
91
+ "key": "/topic-extraction/cells/img_9.jpg_r1_c0.png"
92
+ }
93
+ ],
94
+ "children": []
95
+ },
96
+ {
97
+ "title": "Government policy objectives",
98
+ "contents": [
99
  {
100
  "type": "image",
101
+ "key": "/topic-extraction/cells/img_10.jpg_r1_c0.png"
102
+ }
103
+ ],
104
+ "children": []
105
+ },
106
+ {
107
+ "title": "Fiscal policy",
108
+ "contents": [
109
  {
110
  "type": "image",
111
+ "key": "/topic-extraction/cells/img_11.jpg_r1_c0.png"
112
+ }
113
+ ],
114
+ "children": []
115
+ },
116
+ {
117
+ "title": "Monetary policy",
118
+ "contents": [
119
  {
120
  "type": "image",
121
+ "key": "/topic-extraction/cells/img_12.jpg_r1_c0.png"
122
+ }
123
+ ],
124
+ "children": []
125
+ },
126
+ {
127
+ "title": "Exchange rates and exchange rate policy",
128
+ "contents": [
129
  {
130
  "type": "image",
131
+ "key": "/topic-extraction/cells/img_13.jpg_r1_c0.png"
132
+ }
133
+ ],
134
+ "children": []
135
+ },
136
+ {
137
+ "title": "Free trade and protectionism",
138
+ "contents": [
139
  {
140
  "type": "image",
141
+ "key": "/topic-extraction/cells/img_14.jpg_r1_c0.png"
142
+ }
143
+ ],
144
+ "children": []
145
+ },
146
+ {
147
+ "title": "Costs, revenues and profits",
148
+ "contents": [
149
  {
150
  "type": "image",
151
+ "key": "/topic-extraction/cells/img_15.jpg_r1_c0.png"
152
+ }
153
+ ],
154
+ "children": []
155
+ },
156
+ {
157
+ "title": "Background to market structures",
158
+ "contents": [
159
  {
160
  "type": "image",
161
+ "key": "/topic-extraction/cells/img_16.jpg_r1_c0.png"
162
+ }
163
+ ],
164
+ "children": []
165
+ },
166
+ {
167
+ "title": "Monopoly",
168
+ "contents": [
169
  {
170
  "type": "image",
171
+ "key": "/topic-extraction/cells/img_17.jpg_r2_c0.png"
172
+ }
173
+ ],
174
+ "children": []
175
+ },
176
+ {
177
+ "title": "Short run aggregate supply (SRAS)",
178
+ "contents": [
179
  {
180
  "type": "image",
181
+ "key": "/topic-extraction/cells/img_19.jpg_r1_c0.png"
182
+ }
183
+ ],
184
+ "children": []
185
+ },
186
+ {
187
+ "title": "The short run Phillips curve",
188
+ "contents": [
189
  {
190
  "type": "image",
191
+ "key": "/topic-extraction/cells/img_20.jpg_r1_c0.png"
192
+ }
193
+ ],
194
+ "children": []
195
+ },
196
+ {
197
+ "title": "Economic growth",
198
+ "contents": [
199
  {
200
  "type": "image",
201
+ "key": "/topic-extraction/cells/img_21.jpg_r1_c0.png"
202
+ }
203
+ ],
204
+ "children": []
205
+ },
206
+ {
207
+ "title": "Unemployment",
208
+ "contents": [
209
  {
210
  "type": "image",
211
+ "key": "/topic-extraction/cells/img_22.jpg_r1_c0.png"
212
+ }
213
+ ],
214
+ "children": []
215
+ },
216
+ {
217
+ "title": "Solutions",
218
+ "contents": [
219
  {
220
  "type": "image",
221
+ "key": "/topic-extraction/cells/img_23.jpg_r1_c0.png"
222
+ }
223
+ ],
224
+ "children": []
225
+ },
226
+ {
227
+ "title": "Inflation and deflation",
228
+ "contents": [
229
  {
230
  "type": "image",
231
+ "key": "/topic-extraction/cells/img_24.jpg_r1_c0.png"
232
+ }
233
+ ],
234
+ "children": []
235
+ },
236
+ {
237
+ "title": "The balance of payments",
238
+ "contents": [
239
  {
240
  "type": "image",
241
+ "key": "/topic-extraction/cells/img_25.jpg_r2_c0.png"
242
+ }
243
+ ],
244
+ "children": []
245
+ },
246
+ {
247
+ "title": "Control of the national (public sector) debt",
248
+ "contents": [
249
  {
250
  "type": "image",
251
+ "key": "/topic-extraction/cells/img_26.jpg_r1_c0.png"
252
+ }
253
+ ],
254
+ "children": []
255
+ },
256
+ {
257
+ "title": "The operation of monetary policy and monetary stability",
258
+ "contents": [
259
  {
260
  "type": "image",
261
+ "key": "/topic-extraction/cells/img_27.jpg_r1_c0.png"
262
+ }
263
+ ],
264
+ "children": []
265
+ },
266
+ {
267
+ "title": "Advantages and disadvantages of free trade",
268
+ "contents": [
269
  {
270
  "type": "image",
271
+ "key": "/topic-extraction/cells/img_28.jpg_r1_c0.png"
272
+ }
273
+ ],
274
+ "children": []
275
+ },
276
+ {
277
+ "title": "European Union",
278
+ "contents": [
279
  {
280
  "type": "image",
281
+ "key": "/topic-extraction/cells/img_29.jpg_r1_c0.png"
282
  }
283
  ],
284
  "children": []
topic_extr.py ADDED
@@ -0,0 +1,989 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import re
4
+ import gc
5
+ import json
6
+ import logging
7
+ import fitz
8
+ import boto3
9
+ import base64
10
+ import time
11
+ import asyncio
12
+ import tempfile
13
+ import requests
14
+ from io import BytesIO
15
+ from typing import List, Dict, Any
16
+
17
+ import torch
18
+ import cv2
19
+ import numpy as np
20
+
21
+ from google import genai
22
+ from google.genai import types
23
+
24
+ from magic_pdf.data.dataset import PymuDocDataset
25
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
26
+ from magic_pdf.data.data_reader_writer.base import DataWriter
27
+ from table_row_extraction import TableExtractor
28
+
29
# Root logging goes to the console at INFO level.
logging.basicConfig(level=logging.INFO)

# Records from this module's logger are additionally written to a log file.
file_handler = logging.FileHandler("topic_extraction.log")
file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s"))

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(file_handler)

# Gemini client shared by the call_gemini_* helpers; created on first use.
_GEMINI_CLIENT = None
38
+ # helper functions, also global
39
def unify_whitespace(text: str) -> str:
    """Collapse every run of whitespace in ``text`` to a single space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
41
+
42
def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
    """
    Return the 0-based indices of all pages whose text contains ``search_text``.

    Both the needle and each page's text are whitespace-normalised with
    ``unify_whitespace`` before the substring test, so line breaks and runs of
    spaces in the PDF do not prevent a match.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        search_text: Text to look for.

    Returns:
        Matching page indices in ascending order (empty if nothing matches).
    """
    # Normalise the needle before opening the document so a bad argument
    # cannot leave an open handle behind.
    needle = unify_whitespace(search_text)
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        # Pages are visited in ascending order, so the result is already sorted.
        # NOTE(review): "raw" is not one of the get_text() option names listed in
        # the PyMuPDF docs; presumably it falls back to plain text - confirm.
        return [
            page_no
            for page_no in range(doc.page_count)
            if needle in unify_whitespace(doc[page_no].get_text("raw"))
        ]
    finally:
        # Close the document even if text extraction raises.
        doc.close()
53
+
54
def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    """
    Build a new PDF containing only the requested pages of the original.

    Duplicate indices are ignored and pages are emitted in ascending order.

    Args:
        original_pdf_bytes: Raw bytes of the source PDF.
        page_indices: 0-based page indices to keep.

    Returns:
        The bytes of the newly assembled PDF.

    Raises:
        ValueError: If ``page_indices`` is empty or contains an index outside
            ``0..page_count - 1``.
    """
    if not page_indices:
        raise ValueError("No page indices provided for subset creation.")

    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
    try:
        new_doc = fitz.open()
        try:
            for p in sorted(set(page_indices)):
                if not 0 <= p < doc.page_count:
                    logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
                    raise ValueError(f"Page index {p} out of range.")
                new_doc.insert_pdf(doc, from_page=p, to_page=p)
            return new_doc.tobytes()
        finally:
            # Release both documents on the error path too, not only on success.
            new_doc.close()
    finally:
        doc.close()
69
+
70
def unify_topic_name(raw_title: str, children_subtopics: list) -> str:
    """
    Clean up a topic title.

    - A trailing "continued" (any case) is removed.
    - A title that already starts with a number is returned as is.
    - Otherwise, if every numbered child ("<n>.<m> ...") shares the same leading
      number, that number is prepended ("<n> <title>", or "<n> Topic" when the
      title is empty).
    - The known truncated heading "gonometry" is repaired to "5 Trigonometry"
      when the first numbered child belongs to section 5.

    Args:
        raw_title: Title text as extracted.
        children_subtopics: Child topic dicts; only their "title" is inspected.

    Returns:
        The cleaned title.
    """
    title = raw_title.strip()
    # Remove trailing "continued"
    title = re.sub(r"\s+continued\s*$", "", title, flags=re.IGNORECASE)

    # If title already starts with a number, use it as is.
    if re.match(r"^\d+", title):
        return title

    # Otherwise, try to deduce a numeric prefix from the children.
    prefixes = []
    for child in children_subtopics:
        child_title = (child.get("title") or "").strip()
        m = re.match(r"^(\d+)\.", child_title)
        if m:
            prefixes.append(m.group(1))

    # Repair known broken titles BEFORE prepending a prefix: once the title has
    # become "5 gonometry" it no longer equals "gonometry" and the repair would
    # never trigger for the very case it targets.
    if title.lower() == "gonometry" and prefixes and prefixes[0] == "5":
        return "5 Trigonometry"

    # If all numeric prefixes in children are the same, use that prefix.
    if prefixes and all(p == prefixes[0] for p in prefixes):
        return f"{prefixes[0]} {title}" if title else f"{prefixes[0]} Topic"
    return title
106
+
107
def merge_topics(subtopic_list: list) -> list:
    """
    Merge topics with an enhanced logic:
    1. Clean up each topic's title using unify_topic_name.
    2. Group topics by the parent's numeric prefix (if available). Topics without a
       numeric prefix are grouped by their title.
    3. Reassign children: a child whose title starts with "<n>." but sits under a
       parent with a different key is moved to the parent keyed "<n>", if one exists.
    4. Remove duplicate children (same stripped title) by merging their contents
       and children.
    5. Sort each parent's children, then the parents, by their numeric ordering.

    Args:
        subtopic_list: Topic dicts with optional "title", "contents", "children".

    Returns:
        A new list of merged parent topic dicts. The child dicts in the result
        are copies, so the caller's input dicts are not modified.
    """
    def parse_subtopic_num(subtitle):
        # All digit groups in the title as an int tuple; titles without digits sort last.
        digits = re.findall(r"\d+", subtitle)
        return tuple(int(d) for d in digits) if digits else (9999,)

    def parse_parent_num(topic):
        # Leading number of the parent title; unnumbered parents sort last.
        m = re.match(r"^(\d+)", topic.get("title", ""))
        return int(m.group(1)) if m else 9999

    # First, merge topics by parent's numeric prefix (or full title when unnumbered).
    merged = {}
    for topic_obj in subtopic_list:
        children = topic_obj.get("children", [])
        contents = topic_obj.get("contents", [])
        new_title = unify_topic_name(topic_obj.get("title", ""), children)
        m = re.match(r"^(\d+)", new_title)
        key = m.group(1) if m else new_title

        entry = merged.get(key)
        if entry is None:
            merged[key] = {
                "title": new_title,
                "contents": list(contents),
                "children": list(children),
            }
        else:
            # Merge contents and children; choose the longer title.
            if len(new_title) > len(entry["title"]):
                entry["title"] = new_title
            entry["contents"].extend(contents)
            entry["children"].extend(children)

    # Reassign children to the correct parent based on their numeric prefix.
    for key, topic in merged.items():
        kept = []
        for child in topic["children"]:
            m_child = re.match(r"^(\d+)\.", child.get("title", "").strip())
            if m_child:
                child_prefix = m_child.group(1)
                if key != child_prefix and child_prefix in merged:
                    # The target list belongs to a different parent, so it is
                    # never the list currently being iterated.
                    merged[child_prefix]["children"].append(child)
                    continue
            kept.append(child)
        topic["children"] = kept

    # Remove duplicate children by merging their contents.
    for topic in merged.values():
        child_map = {}
        for child in topic["children"]:
            ctitle = child.get("title", "").strip()
            existing = child_map.get(ctitle)
            if existing is None:
                # Copy the child and guarantee both list keys exist, so merging a
                # later duplicate neither raises KeyError nor mutates the input.
                fresh = dict(child)
                fresh["contents"] = list(child.get("contents", []))
                fresh["children"] = list(child.get("children", []))
                child_map[ctitle] = fresh
            else:
                existing["contents"].extend(child.get("contents", []))
                existing["children"].extend(child.get("children", []))

        # Sort children numerically, component by component (e.g. "2.1" < "2.2" < "2.10").
        topic["children"] = sorted(
            child_map.values(),
            key=lambda ch: parse_subtopic_num(ch.get("title", "")),
        )

    # Convert merged topics to a list sorted by parent number.
    return sorted(merged.values(), key=parse_parent_num)
185
+
186
class s3Writer:
    """Small wrapper around a boto3 S3 client that always targets one bucket."""

    def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
        """Remember the bucket and create the S3 client from the given credentials and endpoint."""
        self.bucket = bucket
        client_options = {
            "aws_access_key_id": ak,
            "aws_secret_access_key": sk,
            "endpoint_url": endpoint_url,
        }
        self.client = boto3.client('s3', **client_options)

    def write(self, path: str, data: bytes) -> None:
        """Upload ``data`` to key ``path`` in the bucket; log and re-raise on any failure."""
        try:
            self.client.upload_fileobj(BytesIO(data), self.bucket, path)
            logger.info(f"Uploaded to S3: {path}")
        except Exception as exc:
            logger.error(f"Failed to upload to S3: {str(exc)}")
            raise

    def delete(self, path: str) -> None:
        """Delete key ``path`` from the bucket; log and re-raise on any failure."""
        try:
            self.client.delete_object(Bucket=self.bucket, Key=path)
        except Exception as exc:
            logger.error(f"Failed to delete from S3: {str(exc)}")
            raise
215
+
216
def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
    """
    Downscale an encoded image and re-encode it as JPEG.

    Args:
        image_data: Encoded image bytes (any format ``cv2.imdecode`` can read).
        max_dim: Longest allowed side in pixels; larger images are scaled down
            with their aspect ratio preserved.
        quality: JPEG quality passed to the encoder.

    Returns:
        The JPEG bytes, or ``image_data`` unchanged when the input is empty,
        cannot be decoded, or cannot be encoded.
    """
    # An empty buffer cannot be decoded; hand it back instead of letting
    # cv2.imdecode raise on it.
    if not image_data:
        return image_data

    arr = np.frombuffer(image_data, np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        return image_data

    h, w = img.shape[:2]
    longest = max(h, w)
    if longest > max_dim:
        scale = max_dim / float(longest)
        # Clamp to 1 px: a very elongated image would otherwise round its short
        # side down to 0, which cv2.resize rejects.
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))
        img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

    encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
    success, enc = cv2.imencode(".jpg", img, encode_params)
    return enc.tobytes() if success else image_data
231
+
232
def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
    """
    Ask Gemini whether an image shows a two- or three-column table.

    Args:
        image_data: JPEG image bytes (sent with mime type image/jpeg).
        api_key: Gemini API key; only used when the shared client has not been
            created yet.
        max_retries: Number of additional attempts after the first failure.

    Returns:
        "THREE_COLUMN", "TWO_COLUMN" or "EMPTY_IMAGE" when the model's reply
        contains "THREE", "TWO" or "EMPTY" respectively (checked in that
        order); "NO_TABLE" for any other reply, an empty reply, an error
        mentioning "503", or once all attempts have failed.
    """
    global _GEMINI_CLIENT

    # The prompt is constant, so build it once rather than on every attempt.
    prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
The three-column 'table' image includes such key features:
- Three columns header
- Headers like 'Topics', 'Content', 'Guidelines', 'Amplification', 'Additional guidance notes', 'Area of Study'
- Possibly sections (e.g. 8.4, 9.1)
The two-column 'table' image includes such key features:
- Two columns
- Headers like 'Subject content', 'Additional information'
- Possibly sections (e.g. 2.1, 3.4, G2, G3, )
If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
If the image is non-empty but does not show a table, respond with 'NO_TABLE'.
Return only one of these exact labels.
"""

    for attempt in range(max_retries + 1):
        try:
            # The client is created on first use and then shared; later calls
            # reuse it regardless of the api_key they pass.
            if _GEMINI_CLIENT is None:
                _GEMINI_CLIENT = genai.Client(api_key=api_key)
            client = _GEMINI_CLIENT

            resp = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    {
                        "parts": [
                            {"text": prompt},
                            {
                                "inline_data": {
                                    "mime_type": "image/jpeg",
                                    "data": base64.b64encode(image_data).decode('utf-8')
                                }
                            }
                        ]
                    }
                ],
                config=types.GenerateContentConfig(temperature=0.0)
            )
            if resp and resp.text:
                classification = resp.text.strip().upper()
                if "THREE" in classification:
                    return "THREE_COLUMN"
                elif "TWO" in classification:
                    return "TWO_COLUMN"
                elif "EMPTY" in classification:
                    return "EMPTY_IMAGE"
            return "NO_TABLE"
        except Exception as e:
            logger.error(f"Gemini table classification error: {e}")
            # An error mentioning 503 is not retried.
            if "503" in str(e):
                return "NO_TABLE"
            if attempt < max_retries:
                time.sleep(0.5)
            else:
                return "NO_TABLE"

    # Reached only when the loop body never ran (negative max_retries); return a
    # label instead of falling through to None.
    return "NO_TABLE"
291
+
292
async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
    """
    Classify an image without blocking the event loop on the Gemini call.

    The image is first shrunk and re-encoded by ``preprocess_image`` (on the
    calling thread); the blocking ``call_gemini_for_table_classification`` call
    then runs in the loop's default executor.

    Returns:
        The label returned by ``call_gemini_for_table_classification``.
    """
    # Inside a coroutine a loop is always running; get_running_loop() is the
    # preferred way to obtain it.
    loop = asyncio.get_running_loop()
    preprocessed = preprocess_image(image_data)
    return await loop.run_in_executor(
        None, call_gemini_for_table_classification, preprocessed, api_key, max_retries
    )
296
+
297
def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
    """
    Ask Gemini to extract a topic title and subtopic numbers from a cell image.

    Args:
        image_data: JPEG image bytes (sent with mime type image/jpeg).
        api_key: Gemini API key; only used when the shared client has not been
            created yet.
        max_retries: Number of additional attempts after the first failure.

    Returns:
        A dict ``{"title": str, "subtopics": list[str]}``. The title is
        "EMPTY_IMAGE" (with no subtopics) when the model reports a blank or
        truncated image. ``{"title": "", "subtopics": []}`` is returned for an
        empty reply or once all attempts have failed (request error or
        unparsable reply).
    """
    global _GEMINI_CLIENT

    # The prompt is constant, so build it once rather than on every attempt.
    prompt = """
You are given an image from an educational curriculum specification for Gemini Flash 2. The image may contain:
1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
2) A subtopic heading in the format "<number>.<number>" or "<number>.<number>.<number>", for example "2.5", "2.6", "3.4", "2.1.1", "4.3.3" or "1.2.1".
3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
4) Possibly no relevant text or only truncated text (e.g. "Topics", "Subject content", "What students need to learn", "Content Amplification Additional guidance notes", etc.).

Your task is to extract:
- **"title"**: A recognized main topic or heading text.
- **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4", "G2", "2.1.1", "4.1.1"), as an array of strings.

Follow these rules:

(1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued":
- Remove the word "continued" if present.
- Put that resulting text in "title". (e.g. "2 Algebra and functions")
- "subtopics" should be an empty array, unless smaller subtopic numbers (e.g. "2.5") are also detected in the same text.

(2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4":
- Collect those exact strings in the JSON key "subtopics" (an array of strings).
- "title" in this case should be an empty string if you only detect subtopics.
(Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).

(3) **If no main topic or subtopic is detected but the text appears to be a heading**, for example "Specialisation, division of labour and exchange", then:
- Return:
{
"title": "<the heading text>",
"subtopics": []
}

(4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
- Use that left column text as "title".
- "subtopics" remains empty.
Example:
If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
{
"title": "Scarcity, choice and opportunity cost",
"subtopics": []
}

(5) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) or it appears to be a standalone column with text, treat it as a heading.
- "subtopics" remains empty.
Example:
If there is only one column image that is "Specialisation, devision of labour and exchange" and the right column is not present, your output is:
{
"title": "Specialisation, devision of labour and exchange",
"subtopics": []
}

(6) **If there is a character + digit pattern** in the left column of a two-column table (for example "G2", "G3", "G4", "C1"), treat that as a topic-like label:
- Put that label text into "title" (e.g. "G2").
- "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.

(7) **Output must be valid JSON** in this exact structure, with no extra text or explanation:
{
"title": "...",
"subtopics": [...]
}

(8) **If the image is blank or truncated**, defined as:
- Contains no words at all (e.g. a blank white or black image), **OR**
- Contains only snippet words/phrases such as "Topics", "Subject content", "Content Amplification Additional guidance notes", "What students need to learn" (including variations in background color), **OR**
- Contains partial headings with no recognizable numeric or textual headings
- Contains partial UI labels only, such as “Topics” in a gray bar or “What students need to learn” in a blue bar, with no additional meaningful text.
then return:
{
"title": "EMPTY_IMAGE",
"subtopics": []
}

(9) **If you cannot recognize any text matching the patterns above**, or the text is too partial/truncated to form a valid heading, also return:
{
"title": "EMPTY_IMAGE",
"subtopics": []
}

**Examples**:

- If the image text is "2 Algebra and functions continued", return:
{
"title": "2 Algebra and functions",
"subtopics": []
}

- If the image text is "2.5 Solve linear and quadratic inequalities ...", return:
{
"title": "",
"subtopics": ["2.5"]
}

- If the image text is "Specialisation, division of labour and exchange" (with no numeric patterns at all), return:
{
"title": "Specialisation, division of labour and exchange",
"subtopics": []
}

- If the left column says "G2" and the right column has details, but no subtopic numbers, return:
{
"title": "G2",
"subtopics": []
}

- If the image is blank or shows only partial/truncated snippet words (e.g. "Topics", "Content Amplification Additional guidance notes", "Subject content", "What students need to learn") and nothing else, return:
{
"title": "EMPTY_IMAGE",
"subtopics": []
}
"""

    for attempt in range(max_retries + 1):
        try:
            # The client is created on first use and then shared; later calls
            # reuse it regardless of the api_key they pass.
            if _GEMINI_CLIENT is None:
                _GEMINI_CLIENT = genai.Client(api_key=api_key)
            client = _GEMINI_CLIENT

            resp = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    {
                        "parts": [
                            {"text": prompt},
                            {
                                "inline_data": {
                                    "mime_type": "image/jpeg",
                                    "data": base64.b64encode(image_data).decode("utf-8")
                                }
                            }
                        ]
                    }
                ],
                config=types.GenerateContentConfig(temperature=0.0)
            )

            if not resp or not resp.text:
                logger.warning("Gemini returned an empty response for subtopic extraction.")
                return {"title": "", "subtopics": []}

            raw = resp.text.strip()
            # Remove any markdown fences if present
            raw = raw.replace("```json", "").replace("```", "").strip()
            # Invalid JSON or a non-object payload raises here and is handled
            # by the retry logic below.
            data = json.loads(raw)

            # A null or non-string title must not raise: that would burn the
            # retries and discard subtopics that were parsed correctly.
            title = data.get("title") or ""
            if not isinstance(title, str):
                title = str(title)
            subtopics = data.get("subtopics", [])
            if title.strip().upper() == "EMPTY_IMAGE":
                return {"title": "EMPTY_IMAGE", "subtopics": []}
            if not isinstance(subtopics, list):
                subtopics = []
            # The contract is an array of strings; normalise entries the model
            # returned as bare numbers and drop nulls.
            subtopics = [str(s) for s in subtopics if s is not None]
            return {"title": title, "subtopics": subtopics}

        except Exception as e:
            logger.error(f"Gemini subtopic identification error on attempt {attempt}: {e}")
            if attempt < max_retries:
                time.sleep(0.5)
            else:
                return {"title": "", "subtopics": []}

    return {"title": "", "subtopics": []}
456
+
457
+ class S3ImageWriter(DataWriter):
458
def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
    """Store the S3 writer, the key prefix (forced to end with "/") and the Gemini key; start with empty state."""
    self.s3_writer = s3_writer
    # Guarantee a trailing slash so keys can be built by plain concatenation.
    if base_path.endswith("/"):
        self.base_path = base_path
    else:
        self.base_path = base_path + "/"
    self.gemini_api_key = gemini_api_key
    # Per-image records, keyed by the path passed to write().
    self.descriptions = {}
    # Number of images written so far; used to build unique S3 key names.
    self._img_count = 0
    self.extracted_tables = {}

    self.extracted_subtopics = {}
467
+
468
def write(self, path: str, data: bytes) -> None:
    """Upload ``data`` under the next sequential ``img_<n>.jpg`` key and record it under ``path``."""
    self._img_count += 1
    s3_key = f"{self.base_path}img_{self._img_count}.jpg"
    self.s3_writer.write(s3_key, data)
    # Until classified, every image is assumed not to be a table.
    self.descriptions[path] = dict(
        data=data,
        s3_path=s3_key,
        table_classification="NO_TABLE",
        final_alt="",
    )
479
+
480
async def post_process_async(self, key: str, md_content: str) -> str:
    """
    Classify every recorded image, rewrite its markdown reference, and return
    only the image lines of the resulting markdown.

    Images classified "EMPTY_IMAGE" are removed from the markdown, deleted from
    S3 (best effort) and dropped from ``self.descriptions``. All other images
    get an alt text describing their classification and point at their S3 key.
    Table images are then handed to ``_process_table_images_in_markdown``.
    """
    logger.info("Classifying images to detect tables.")
    pending = {}
    for img_path, meta in self.descriptions.items():
        pending[img_path] = asyncio.create_task(
            classify_image_async(meta["data"], self.gemini_api_key)
        )
    outcomes = await asyncio.gather(*pending.values(), return_exceptions=True)
    for img_path, outcome in zip(list(self.descriptions.keys()), outcomes):
        if not isinstance(outcome, Exception):
            self.descriptions[img_path]['table_classification'] = outcome
            continue
        logger.error(f"Table classification error for {img_path}: {outcome}")
        self.descriptions[img_path]['table_classification'] = "NO_TABLE"

    # Alt texts for the table classes; anything else becomes "NO_TABLE image".
    table_alts = {
        "TWO_COLUMN": "HAS TO BE PROCESSED - two column table",
        "THREE_COLUMN": "HAS TO BE PROCESSED - three column table",
    }
    # Iterate over a snapshot because empty images are deleted from the dict.
    for img_path, meta in list(self.descriptions.items()):
        label = meta['table_classification']
        placeholder = f"![]({key}{img_path})"
        if label == "EMPTY_IMAGE":
            md_content = md_content.replace(placeholder, "")
            try:
                self.s3_writer.delete(meta['s3_path'])
            except Exception as exc:
                logger.error(f"Error deleting S3 object {meta['s3_path']}: {exc}")
            del self.descriptions[img_path]
            continue
        meta['final_alt'] = table_alts.get(label, "NO_TABLE image")
        md_content = md_content.replace(placeholder, f"![{meta['final_alt']}]({meta['s3_path']})")

    md_content = await self._process_table_images_in_markdown(key, md_content)

    # Keep only the lines that start with a markdown image.
    image_line = re.compile(r"^\!\[.*\]\(.*\)")
    kept = []
    for line in md_content.split("\n"):
        stripped = line.strip()
        if image_line.match(stripped):
            kept.append(stripped)
    return "\n".join(kept)
521
+
522
+ async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
523
+ pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
524
+ matches = re.findall(pat, md_content, flags=re.IGNORECASE)
525
+ if not matches:
526
+ return md_content
527
+
528
+ for (col_type, s3_key) in matches:
529
+ logger.info(f"Processing table image: {s3_key}, columns={col_type}")
530
+ img_data = None
531
+ for desc in self.descriptions.values():
532
+ if desc.get("s3_path") == s3_key:
533
+ img_data = desc.get("data")
534
+ break
535
+ if img_data is None:
536
+ logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
537
+ continue
538
+
539
+ # Write temporary file for processing.
540
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
541
+ temp_file.write(img_data)
542
+ temp_path = temp_file.name
543
+
544
+ try:
545
+ if col_type.lower() == 'two':
546
+ extractor = TableExtractor(
547
+ skip_header=True,
548
+ merge_two_col_rows=True,
549
+ enable_subtopic_merge=True,
550
+ subtopic_threshold=0.2
551
+ )
552
+ else:
553
+ extractor = TableExtractor(
554
+ skip_header=True,
555
+ merge_two_col_rows=False,
556
+ enable_subtopic_merge=False,
557
+ subtopic_threshold=0.2
558
+ )
559
+ row_boxes = extractor.process_image(temp_path)
560
+ out_folder = temp_path + "_rows"
561
+ os.makedirs(out_folder, exist_ok=True)
562
+ extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
563
+
564
+ #Group cells by row using file name pattern
565
+ recognized_main_topic = ""
566
+ main_topic_image_key = None
567
+ recognized_subtopics = []
568
+ header_found = False
569
+ header_row_index = None
570
+
571
+ # Loop through each row of extracted cells
572
+ for i, row in enumerate(row_boxes):
573
+ row_dir = os.path.join(out_folder, f"row_{i}")
574
+ valid_info = None
575
+ valid_cell_key = None
576
+ for j in range(len(row)):
577
+ cell_path = os.path.join(row_dir, f"col_{j}.png")
578
+ if not os.path.isfile(cell_path):
579
+ alternative_path = os.path.join(row_dir, f"col_{j}.jpg")
580
+ if os.path.isfile(alternative_path):
581
+ cell_path = alternative_path
582
+ else:
583
+ logger.warning(f"Cell image not found: {cell_path}")
584
+ continue
585
+ with open(cell_path, "rb") as cf:
586
+ cell_image_data = cf.read()
587
+ cell_key = f"{self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.png"
588
+ self.s3_writer.write(cell_key, cell_image_data)
589
+ info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)
590
+ if info.get("title", "").upper() == "EMPTY_IMAGE":
591
+ try:
592
+ self.s3_writer.delete(cell_key)
593
+ logger.info(f"Deleted empty cell image from S3: {cell_key}")
594
+ except Exception as e:
595
+ logger.error(f"Error deleting empty cell image {cell_key}: {e}")
596
+ continue
597
+ valid_info = info
598
+ valid_cell_key = cell_key
599
+ break # Use only the first valid cell in this row
600
+
601
+ if valid_info is None:
602
+ continue
603
+
604
+ # First valid row becomes header row.
605
+ if not header_found:
606
+ header_found = True
607
+ header_row_index = i
608
+ recognized_main_topic = valid_info.get("title", "")
609
+ main_topic_image_key = valid_cell_key
610
+ # The row immediately following the header is used for subtopic children.
611
+ elif i == header_row_index + 1:
612
+ for st in valid_info.get("subtopics", []):
613
+ recognized_subtopics.append({
614
+ "title": st,
615
+ "contents": [{"type": "image", "key": valid_cell_key}],
616
+ "children": []
617
+ })
618
+ else:
619
+ # Ignore further rows
620
+ continue
621
+
622
+ final_json = {
623
+ "title": recognized_main_topic,
624
+ "contents": [],
625
+ "children": recognized_subtopics
626
+ }
627
+ if main_topic_image_key:
628
+ final_json["contents"].append({"type": "image", "key": main_topic_image_key})
629
+
630
+ # Save the final JSON.
631
+ self.extracted_subtopics[s3_key] = final_json
632
+
633
+ # Create a snippet to replace the markdown line.
634
+ snippet = ["**Extracted table cells:**"]
635
+ if main_topic_image_key:
636
+ snippet.append(f"![Header]({main_topic_image_key})")
637
+ for child in recognized_subtopics:
638
+ for content in child.get("contents", []):
639
+ snippet.append(f"![Child]({content.get('key')})")
640
+ new_snip = "\n".join(snippet)
641
+ old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
642
+ md_content = md_content.replace(old_line, new_snip)
643
+
644
+ except Exception as e:
645
+ logger.error(f"Error processing table image {s3_key}: {e}")
646
+ finally:
647
+ os.remove(temp_path)
648
+
649
+ return md_content
650
+
651
+ def post_process(self, key: str, md_content: str) -> str:
652
+ return asyncio.run(self.post_process_async(key, md_content))
653
+
654
class GeminiTopicExtractor:
    """Asks Gemini for the major subtopics and their page ranges from a PDF's first pages."""

    def __init__(self, api_key: str = None, num_pages: int = 14):
        # Fall back to the environment when no key is passed explicitly.
        self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
        self.num_pages = num_pages

    @staticmethod
    def _valid_ranges(mapping: dict) -> dict:
        """Keep only ``name -> [start, end]`` entries whose two items are integers."""
        return {
            name: rng
            for name, rng in mapping.items()
            if isinstance(rng, list)
            and len(rng) == 2
            and all(isinstance(v, int) and not isinstance(v, bool) for v in rng)
        }

    def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
        """Return ``{subtopic name: [start_page, end_page]}`` as reported by the LLM.

        Args:
            pdf_path: Local path or http(s) URL of the PDF.

        Returns:
            A dict of page ranges as given in the table of contents; an empty
            dict when no text could be read, the model returned nothing, or
            its reply could not be parsed as a JSON object.
        """
        first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
        if not first_pages_text.strip():
            logger.error("No text from first pages => cannot extract subtopics.")
            return {}
        prompt = f"""
        You have the first pages of a PDF specification, including a table of contents.
        Instructions:
        1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
        2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
        3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
        4. Output only valid JSON of the form:
        {{
        "Subtopic A": [start_page, end_page],
        "Subtopic B": [start_page, end_page]
        }}
        5. If you can't find any subtopics, return an empty JSON.
        Important notes:
        - The correct "end_page" must be the page number of the next topic or subtopic minus 1.
        - The final output must be valid JSON only, with no extra text or code blocks.
        Examples:
        1. Given this table of contents:
        1 Introduction – 2
        Why choose Edexcel A Level Mathematics? - 2
        Supporting you in planning and implementing this qualification - 3
        Qualification at a glance - 5
        2 Subject content and assessment information – 7
        Paper 1 and Paper 2: Pure Mathematics - 11
        Paper 3: Statistics and Mechanics - 30
        Assessment Objectives - 40
        3 Administration and general information – 42
        Entries - 42
        Access arrangements, reasonable adjustments, special consideration and malpractice - 42
        Student recruitment and progression - 45
        Appendix 1: Formulae – 49
        Appendix 2: Notation – 53
        Appendix 3: Use of calculators – 59
        Appendix 4: Assessment Objectives – 60
        Appendix 5: The context for the development of this qualification – 62
        Appendix 6: Transferable skills – 64
        Appendix 7: Level 3 Extended Project qualification – 65
        Appendix 8: Codes – 67
        The correct output should be:
        {{
        "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
        "Paper 3: Statistics and Mechanics": [30, 42]
        }}
        2. Given this table of contents:
        Qualification at a glance – 1
        Assessment Objectives and weightings - 4
        Knowledge, skills and understanding – 5
        Theme 1: Introduction to markets and market failure - 5
        Theme 2: The UK economy – performance and policies - 11
        Theme 3: Business behaviour and the labour market - 21
        Theme 4: A global perspective - 29
        Assessment – 39
        Assessment summary - 39
        Assessment objectives - 41
        Assessment overview - 42
        Breakdown of assessment objectives - 42
        Synoptic assessment - 43
        Discount code and performance tables - 43
        Access arrangements, reasonable adjustments and special consideration - 44
        Malpractice - 45
        Equality Act 2010 and Pearson equality policy - 45
        Synoptic assessment - 46
        Awarding and reporting - 47
        Other information – 49
        Student recruitment -49
        Prior learning and other requirements -49
        Progression - 49
        Appendix 1: Transferable skills – 53
        Appendix 2: Level 3 Extended Project qualification – 55
        Appendix 3: Quantitative skills – 59
        Appendix 4: Codes – 61
        Appendix 5: Index – 63
        The correct output should be:
        {{
        "Theme 1: Introduction to markets and market failure": [5, 10],
        "Theme 2: The UK economy – performance and policies": [11, 20],
        "Theme 3: Business behaviour and the labour market": [21, 28],
        "Theme 4: A global perspective": [29, 38]
        }}
        3. You might also see sections like:
        2.1 AS Unit 1 11
        2.2 AS Unit 2 18
        2.3 A2 Unit 3 24
        2.4 A2 Unit 4 31
        In that scenario, your output might look like:
        {{
        "2.1 AS Unit 1": [11, 17],
        "2.2 AS Unit 2": [18, 23],
        "2.3 A2 Unit 3": [24, 30],
        "2.4 A2 Unit 4": [31, 35]
        }}
        or
        2.1 AS units 6
        2.2 AS units 23
        In that scenario, your output might look like:
        {{
        "2.1 AS Unit 1": [6, 22],
        "2.2 AS Unit 2": [23, 43]
        }}

        4. Another example might list subtopics:
        3.1 Overarching themes 11
        3.2 A: Proof 12
        3.3 B: Algebra and functions 13
        3.4 C: Coordinate geometry in the ( x , y ) plane 14
        3.5 D: Sequences and series 15
        3.6 E: Trigonometry 16
        3.7 F: Exponentials and logarithms 17
        3.8 G: Differentiation 18
        3.9 H: Integration 19
        3.10 I: Numerical methods 20
        3.11 J: Vectors 20
        3.12 K: Statistical sampling 21
        3.13 L: Data presentation and interpretation 21
        3.14 M: Probability 22
        3.15 N: Statistical distributions 23
        3.16 O: Statistical hypothesis testing 23
        3.17 P: Quantities and units in mechanics 24
        3.18 Q: Kinematics 24
        3.19 R: Forces and Newton’s laws 24
        3.20 S: Moments 25
        3.21 Use of data in statistics 26
        Here the correct output might look like:
        {{
        "A: Proof": [12, 12],
        "B: Algebra and functions": [13, 13],
        ...
        }}
        Now, extract topics from this text:
        {first_pages_text}
        """
        # The client is created once and shared through the module-level global.
        global _GEMINI_CLIENT
        if _GEMINI_CLIENT is None:
            _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
        client = _GEMINI_CLIENT
        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt],
                config=types.GenerateContentConfig(temperature=0.0)
            )
            if not response or not response.text:
                logger.warning("No text from LLM => returning empty subtopics.")
                return {}
            raw_json = response.text.strip()
            # The model sometimes wraps its reply in markdown fences despite the prompt.
            cleaned = raw_json.replace("```json", "").replace("```", "")
            try:
                data = json.loads(cleaned)
            except Exception as json_err:
                logger.error(f"JSON parsing error: {json_err}")
                return {}
            if not isinstance(data, dict):
                # e.g. a JSON array or scalar: there is nothing to map names to ranges.
                logger.error(f"Gemini subtopic extraction error: expected a JSON object, got {type(data).__name__}")
                return {}
            # If the reply nests the ranges one level down, use the first nested object.
            found_sub_dict = next((v for v in data.values() if isinstance(v, dict)), None)
            return self._valid_ranges(found_sub_dict if found_sub_dict is not None else data)
        except Exception as e:
            logger.error(f"Gemini subtopic extraction error: {e}")
            return {}

    def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
        """Return the text of the first ``num_pages`` pages, joined by newlines.

        Returns an empty string when the download fails with a non-200 status;
        any other error is logged and whatever text was read so far is returned.
        """
        text_parts = []
        try:
            if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
                # Timeout so an unresponsive server cannot hang the pipeline forever.
                response = requests.get(pdf_path, timeout=60)
                if response.status_code != 200:
                    logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
                    return ""
                pdf_bytes = response.content
            else:
                with open(pdf_path, "rb") as f:
                    pdf_bytes = f.read()
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                pages_to_read = min(num_pages, doc.page_count)
                for i in range(pages_to_read):
                    raw_text = doc[i].get_text("raw")
                    text_parts.append(raw_text)
            finally:
                # Close the document even if text extraction fails midway.
                doc.close()
        except Exception as e:
            logger.error(f"Could not open PDF: {e}")
        return "\n".join(text_parts)
854
+
855
class MineruNoTextProcessor:
    """Runs layout analysis on the subtopic pages of a PDF and writes ``_subtopics.json``."""

    def __init__(self, output_folder: str, gemini_api_key: str):
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)
        self.layout_model = "doclayout_yolo"
        self.formula_enable = True
        self.table_enable = False
        self.language = "en"

        self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
        self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")

        self.use_s3 = True
        # S3 credentials and endpoint come from the environment.
        self.s3_writer = s3Writer(
            ak=os.getenv("S3_ACCESS_KEY"),
            sk=os.getenv("S3_SECRET_KEY"),
            bucket="quextro-resources",
            endpoint_url=os.getenv("S3_ENDPOINT")
        )

    def cleanup_gpu(self):
        """Run garbage collection and empty the CUDA cache; failures are only logged."""
        try:
            gc.collect()
            torch.cuda.empty_cache()
            logger.info("GPU memory cleaned up.")
        except Exception as e:
            logger.error(f"Error during GPU cleanup: {e}")

    @staticmethod
    def _is_page_range(rng) -> bool:
        """True when ``rng`` is a two-item list of integers."""
        return (
            isinstance(rng, list)
            and len(rng) == 2
            and all(isinstance(v, int) and not isinstance(v, bool) for v in rng)
        )

    def process(self, pdf_path: str) -> Dict[str, Any]:
        """Extract subtopic structures from ``pdf_path`` and save them as JSON.

        Args:
            pdf_path: Local path or http(s) URL of the PDF.

        Returns:
            ``{"final_markdown": str, "subtopics_extracted": list}``.

        Raises:
            Exception: when the PDF cannot be downloaded; other errors from the
                pipeline propagate unchanged. GPU cleanup always runs.
        """
        from statistics import StatisticsError, median, mode

        logger.info(f"Processing PDF: {pdf_path}")
        try:
            subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
            logger.info(f"Gemini returned subtopics: {subtopics}")

            if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
                # Timeout so an unresponsive server cannot hang the pipeline forever.
                response = requests.get(pdf_path, timeout=60)
                if response.status_code != 200:
                    logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
                    raise Exception(f"Failed to download PDF: {pdf_path}")
                pdf_bytes = response.content
                logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
            else:
                with open(pdf_path, "rb") as f:
                    pdf_bytes = f.read()
                logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)

            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            total_pages = doc.page_count
            doc.close()

            # Decide which pages to process
            final_pages = set()
            if not subtopics:
                # fallback
                final_pages = set(range(total_pages))
            else:
                # Estimate the shift between the page numbers quoted in the table of
                # contents and the pages where the subtopic names are actually found.
                offset_candidates = []
                for subname, rng in subtopics.items():
                    if not self._is_page_range(rng):
                        # Malformed ranges must not crash the offset estimate.
                        continue
                    start_p, _ = rng
                    occs = find_all_occurrences(pdf_bytes, subname)
                    for p in occs:
                        candidate = p - (start_p - 1)
                        if candidate > 0:
                            offset_candidates.append(candidate)
                if offset_candidates:
                    try:
                        global_offset = mode(offset_candidates)
                    except StatisticsError:
                        # Python < 3.8 raises when there is no unique mode.
                        global_offset = int(median(offset_candidates))
                else:
                    global_offset = 0

                logger.info(f"Computed global offset: {global_offset}")
                for subname, rng in subtopics.items():
                    if not self._is_page_range(rng):
                        continue
                    start_p, end_p = rng
                    if start_p > end_p:
                        continue
                    s0 = (start_p - 1) + global_offset
                    e0 = (end_p - 1) + global_offset
                    for pp in range(s0, e0 + 1):
                        # Keep only pages that exist in the document.
                        if 0 <= pp < total_pages:
                            final_pages.add(pp)

            if not final_pages:
                final_pages = set(range(total_pages))

            logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
            subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))

            # Analyze and produce markdown
            dataset = PymuDocDataset(subset_pdf_bytes)
            inference = doc_analyze(
                dataset,
                ocr=True,
                lang=self.language,
                layout_model=self.layout_model,
                formula_enable=self.formula_enable,
                table_enable=self.table_enable
            )
            # Images are written to S3 under /topic-extraction/.
            writer = S3ImageWriter(self.s3_writer, "/topic-extraction", self.gemini_api_key)

            md_prefix = "/topic-extraction/"
            pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
            md_content = pipe_result.get_markdown(md_prefix)
            final_markdown = writer.post_process(md_prefix, md_content)

            subtopic_list = list(writer.extracted_subtopics.values())
            subtopic_list = merge_topics(subtopic_list)

            out_path = os.path.join(self.output_folder, "_subtopics.json")
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(subtopic_list, f, indent=2)
            logger.info(f"Final subtopics JSON saved locally at {out_path}")

            return {
                "final_markdown": final_markdown,
                "subtopics_extracted": subtopic_list
            }
        finally:
            self.cleanup_gpu()
979
+
980
if __name__ == "__main__":
    input_pdf = "/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf"
    output_dir = "/home/user/app/pearson_json"
    # Secrets must come from the environment only. The key that used to be the
    # hard-coded fallback here has been committed to version control and should
    # be revoked.
    gemini_key = os.getenv("GEMINI_API_KEY", "")
    if not gemini_key:
        logger.error("GEMINI_API_KEY is not set; cannot run topic extraction.")
        raise SystemExit(1)
    try:
        processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
        result = processor.process(input_pdf)
        logger.info("Processing completed successfully.")
    except Exception as e:
        logger.error(f"Processing failed: {e}")
topic_extraction.log CHANGED
@@ -7483,3 +7483,237 @@ and series'. Using page 7.
7483
  2025-03-04 17:29:32,884 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
7484
  2025-03-04 17:29:33,308 [INFO] __main__ - Classifying images to detect tables.
7485
  2025-03-04 17:59:52,883 [INFO] __main__ - GPU memory cleaned up.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7483
  2025-03-04 17:29:32,884 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
7484
  2025-03-04 17:29:33,308 [INFO] __main__ - Classifying images to detect tables.
7485
  2025-03-04 17:59:52,883 [INFO] __main__ - GPU memory cleaned up.
7486
+ 2025-03-04 18:24:55,659 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf
7487
+ 2025-03-04 18:24:56,486 [INFO] __main__ - Gemini returned subtopics: {'2.1AS units': [7, 22], '2.2A2 units': [23, 43]}
7488
+ 2025-03-04 18:24:56,487 [INFO] __main__ - Loaded 3543551 bytes from local file '/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf'
7489
+ 2025-03-04 18:24:56,724 [INFO] __main__ - Computed global offset: 0
7490
+ 2025-03-04 18:24:56,725 [INFO] __main__ - Processing pages (0-based): [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
7491
+ 2025-03-04 18:26:37,627 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
7492
+ 2025-03-04 18:26:38,287 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
7493
+ 2025-03-04 18:26:38,720 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
7494
+ 2025-03-04 18:26:39,215 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
7495
+ 2025-03-04 18:26:39,531 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
7496
+ 2025-03-04 18:26:39,917 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
7497
+ 2025-03-04 18:26:40,490 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
7498
+ 2025-03-04 18:26:40,968 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
7499
+ 2025-03-04 18:26:41,372 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
7500
+ 2025-03-04 18:26:41,675 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
7501
+ 2025-03-04 18:26:42,251 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
7502
+ 2025-03-04 18:26:42,757 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
7503
+ 2025-03-04 18:26:43,326 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
7504
+ 2025-03-04 18:26:43,626 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
7505
+ 2025-03-04 18:26:44,254 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
7506
+ 2025-03-04 18:26:44,797 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
7507
+ 2025-03-04 18:26:45,300 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
7508
+ 2025-03-04 18:26:45,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
7509
+ 2025-03-04 18:26:46,237 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
7510
+ 2025-03-04 18:26:46,642 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
7511
+ 2025-03-04 18:26:47,162 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
7512
+ 2025-03-04 18:26:47,668 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
7513
+ 2025-03-04 18:26:48,043 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
7514
+ 2025-03-04 18:26:48,639 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
7515
+ 2025-03-04 18:26:49,154 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
7516
+ 2025-03-04 18:26:49,534 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
7517
+ 2025-03-04 18:26:50,096 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
7518
+ 2025-03-04 18:26:50,670 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
7519
+ 2025-03-04 18:26:51,044 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
7520
+ 2025-03-04 18:26:51,475 [INFO] __main__ - Classifying images to detect tables.
7521
+ 2025-03-04 18:26:56,074 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
7522
+ 2025-03-04 18:26:59,389 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r0_c0.png
7523
+ 2025-03-04 18:27:00,348 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_1.jpg_r0_c0.png
7524
+ 2025-03-04 18:27:00,601 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c0.png
7525
+ 2025-03-04 18:27:10,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r2_c0.png
7526
+ 2025-03-04 18:27:11,820 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r3_c0.png
7527
+ 2025-03-04 18:27:12,855 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
7528
+ 2025-03-04 18:27:13,889 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
7529
+ 2025-03-04 18:27:13,890 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
7530
+ 2025-03-04 18:27:17,341 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r0_c0.png
7531
+ 2025-03-04 18:27:18,536 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c0.png
7532
+ 2025-03-04 18:27:19,842 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c0.png
7533
+ 2025-03-04 18:27:20,887 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c0.png
7534
+ 2025-03-04 18:27:22,626 [WARNING] __main__ - Cell image not found: /tmp/tmpns_p2pw7.jpg_rows/row_4/col_0.png
7535
+ 2025-03-04 18:27:22,626 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
7536
+ 2025-03-04 18:27:24,756 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r0_c0.png
7537
+ 2025-03-04 18:27:25,630 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_3.jpg_r0_c0.png
7538
+ 2025-03-04 18:27:25,976 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c0.png
7539
+ 2025-03-04 18:27:26,909 [WARNING] __main__ - Cell image not found: /tmp/tmpmkqp5iik.jpg_rows/row_2/col_0.png
7540
+ 2025-03-04 18:27:26,910 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
7541
+ 2025-03-04 18:27:29,569 [WARNING] __main__ - Cell image not found: /tmp/tmpnakrpg49.jpg_rows/row_0/col_0.png
7542
+ 2025-03-04 18:27:29,569 [WARNING] __main__ - Cell image not found: /tmp/tmpnakrpg49.jpg_rows/row_0/col_1.png
7543
+ 2025-03-04 18:27:29,835 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r1_c0.png
7544
+ 2025-03-04 18:27:30,823 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_4.jpg_r1_c0.png
7545
+ 2025-03-04 18:27:30,823 [WARNING] __main__ - Cell image not found: /tmp/tmpnakrpg49.jpg_rows/row_1/col_1.png
7546
+ 2025-03-04 18:27:31,085 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r2_c0.png
7547
+ 2025-03-04 18:27:33,674 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r3_c0.png
7548
+ 2025-03-04 18:27:34,672 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
7549
+ 2025-03-04 18:27:35,592 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
7550
+ 2025-03-04 18:27:35,593 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
7551
+ 2025-03-04 18:27:36,679 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r0_c0.png
7552
+ 2025-03-04 18:27:37,655 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_5.jpg_r0_c0.png
7553
+ 2025-03-04 18:27:37,997 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c0.png
7554
+ 2025-03-04 18:27:38,787 [WARNING] __main__ - Cell image not found: /tmp/tmp59baffv6.jpg_rows/row_2/col_0.png
7555
+ 2025-03-04 18:27:38,787 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
7556
+ 2025-03-04 18:27:40,808 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r0_c0.png
7557
+ 2025-03-04 18:27:41,806 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_6.jpg_r0_c0.png
7558
+ 2025-03-04 18:27:42,094 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c0.png
7559
+ 2025-03-04 18:27:43,132 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
7560
+ 2025-03-04 18:27:44,097 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
7561
+ 2025-03-04 18:27:44,097 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
7562
+ 2025-03-04 18:27:47,411 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r0_c0.png
7563
+ 2025-03-04 18:27:48,353 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_7.jpg_r0_c0.png
7564
+ 2025-03-04 18:27:48,705 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c0.png
7565
+ 2025-03-04 18:27:49,963 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c0.png
7566
+ 2025-03-04 18:27:50,936 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
7567
+ 2025-03-04 18:27:52,024 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
7568
+ 2025-03-04 18:27:52,025 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
7569
+ 2025-03-04 18:27:54,377 [WARNING] __main__ - Cell image not found: /tmp/tmpsppe7tt4.jpg_rows/row_0/col_0.png
7570
+ 2025-03-04 18:27:54,378 [WARNING] __main__ - Cell image not found: /tmp/tmpsppe7tt4.jpg_rows/row_0/col_1.png
7571
+ 2025-03-04 18:27:54,639 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r1_c0.png
7572
+ 2025-03-04 18:27:55,574 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_8.jpg_r1_c0.png
7573
+ 2025-03-04 18:27:55,856 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c0.png
7574
+ 2025-03-04 18:27:56,935 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c0.png
7575
+ 2025-03-04 18:27:57,936 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
7576
+ 2025-03-04 18:27:58,830 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
7577
+ 2025-03-04 18:27:58,830 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
7578
+ 2025-03-04 18:28:00,927 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r0_c0.png
7579
+ 2025-03-04 18:28:01,839 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_9.jpg_r0_c0.png
7580
+ 2025-03-04 18:28:02,124 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c0.png
7581
+ 2025-03-04 18:28:03,147 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c0.png
7582
+ 2025-03-04 18:28:04,318 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r3_c0.png
7583
+ 2025-03-04 18:28:05,234 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
7584
+ 2025-03-04 18:28:06,333 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
7585
+ 2025-03-04 18:28:06,333 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
7586
+ 2025-03-04 18:28:07,300 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r0_c0.png
7587
+ 2025-03-04 18:28:08,246 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_10.jpg_r0_c0.png
7588
+ 2025-03-04 18:28:08,508 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c0.png
7589
+ 2025-03-04 18:28:09,569 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
7590
+ 2025-03-04 18:28:10,602 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
7591
+ 2025-03-04 18:28:10,603 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=three
7592
+ 2025-03-04 18:28:13,214 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r0_c0.png
7593
+ 2025-03-04 18:28:14,131 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_11.jpg_r0_c0.png
7594
+ 2025-03-04 18:28:14,477 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c0.png
7595
+ 2025-03-04 18:28:15,765 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
7596
+ 2025-03-04 18:28:16,868 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
7597
+ 2025-03-04 18:28:16,869 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
7598
+ 2025-03-04 18:28:19,488 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r0_c0.png
7599
+ 2025-03-04 18:28:20,477 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_12.jpg_r0_c0.png
7600
+ 2025-03-04 18:28:20,850 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c0.png
7601
+ 2025-03-04 18:28:21,976 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
7602
+ 2025-03-04 18:28:22,922 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
7603
+ 2025-03-04 18:28:22,923 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
7604
+ 2025-03-04 18:28:26,026 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
7605
+ 2025-03-04 18:28:26,939 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
7606
+ 2025-03-04 18:28:27,213 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c1.png
7607
+ 2025-03-04 18:28:28,270 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r0_c1.png
7608
+ 2025-03-04 18:28:28,611 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c0.png
7609
+ 2025-03-04 18:28:29,683 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
7610
+ 2025-03-04 18:28:30,673 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
7611
+ 2025-03-04 18:28:30,933 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c0.png
7612
+ 2025-03-04 18:28:31,996 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
7613
+ 2025-03-04 18:28:32,949 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
7614
+ 2025-03-04 18:28:32,950 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
7615
+ 2025-03-04 18:28:34,332 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r0_c0.png
7616
+ 2025-03-04 18:28:35,272 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_14.jpg_r0_c0.png
7617
+ 2025-03-04 18:28:35,541 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c0.png
7618
+ 2025-03-04 18:28:36,537 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
7619
+ 2025-03-04 18:28:37,794 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
7620
+ 2025-03-04 18:28:37,794 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
7621
+ 2025-03-04 18:28:43,119 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r0_c0.png
7622
+ 2025-03-04 18:28:44,084 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_15.jpg_r0_c0.png
7623
+ 2025-03-04 18:28:44,353 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c0.png
7624
+ 2025-03-04 18:28:45,692 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c0.png
7625
+ 2025-03-04 18:28:46,679 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c0.png
7626
+ 2025-03-04 18:28:47,545 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
7627
+ 2025-03-04 18:28:48,749 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
7628
+ 2025-03-04 18:28:48,749 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
7629
+ 2025-03-04 18:28:51,810 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r0_c0.png
7630
+ 2025-03-04 18:28:52,802 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_16.jpg_r0_c0.png
7631
+ 2025-03-04 18:28:53,064 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c0.png
7632
+ 2025-03-04 18:28:54,144 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c0.png
7633
+ 2025-03-04 18:28:55,133 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c0.png
7634
+ 2025-03-04 18:28:57,845 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c0.png
7635
+ 2025-03-04 18:28:58,855 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
7636
+ 2025-03-04 18:28:59,722 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
7637
+ 2025-03-04 18:28:59,722 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
7638
+ 2025-03-04 18:29:02,875 [WARNING] __main__ - Cell image not found: /tmp/tmp0emfx_zt.jpg_rows/row_0/col_0.png
7639
+ 2025-03-04 18:29:03,148 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r1_c0.png
7640
+ 2025-03-04 18:29:04,098 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_17.jpg_r1_c0.png
7641
+ 2025-03-04 18:29:04,361 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c0.png
7642
+ 2025-03-04 18:29:05,885 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c0.png
7643
+ 2025-03-04 18:29:06,881 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
7644
+ 2025-03-04 18:29:07,738 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
7645
+ 2025-03-04 18:29:07,739 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
7646
+ 2025-03-04 18:29:09,552 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r0_c0.png
7647
+ 2025-03-04 18:29:10,757 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c0.png
7648
+ 2025-03-04 18:29:11,784 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r2_c0.png
7649
+ 2025-03-04 18:29:12,800 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
7650
+ 2025-03-04 18:29:13,609 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
7651
+ 2025-03-04 18:29:13,610 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
7652
+ 2025-03-04 18:29:16,305 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r0_c0.png
7653
+ 2025-03-04 18:29:17,210 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_19.jpg_r0_c0.png
7654
+ 2025-03-04 18:29:17,472 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c0.png
7655
+ 2025-03-04 18:29:18,587 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c0.png
7656
+ 2025-03-04 18:29:19,610 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
7657
+ 2025-03-04 18:29:20,792 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
7658
+ 2025-03-04 18:29:20,792 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
7659
+ 2025-03-04 18:29:22,579 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r0_c0.png
7660
+ 2025-03-04 18:29:23,599 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_20.jpg_r0_c0.png
7661
+ 2025-03-04 18:29:23,861 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c0.png
7662
+ 2025-03-04 18:29:24,796 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r2_c0.png
7663
+ 2025-03-04 18:29:25,612 [WARNING] __main__ - Cell image not found: /tmp/tmpmxenc_0d.jpg_rows/row_3/col_0.png
7664
+ 2025-03-04 18:29:25,613 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
7665
+ 2025-03-04 18:29:28,446 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r0_c0.png
7666
+ 2025-03-04 18:29:29,404 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_21.jpg_r0_c0.png
7667
+ 2025-03-04 18:29:29,814 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c0.png
7668
+ 2025-03-04 18:29:30,864 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
7669
+ 2025-03-04 18:29:31,899 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
7670
+ 2025-03-04 18:29:31,899 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
7671
+ 2025-03-04 18:29:34,452 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r0_c0.png
7672
+ 2025-03-04 18:29:35,395 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_22.jpg_r0_c0.png
7673
+ 2025-03-04 18:29:35,740 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c0.png
7674
+ 2025-03-04 18:29:36,880 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
7675
+ 2025-03-04 18:29:37,830 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
7676
+ 2025-03-04 18:29:37,830 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
7677
+ 2025-03-04 18:29:39,773 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r0_c0.png
7678
+ 2025-03-04 18:29:40,725 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_23.jpg_r0_c0.png
7679
+ 2025-03-04 18:29:40,986 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c0.png
7680
+ 2025-03-04 18:29:41,800 [WARNING] __main__ - Cell image not found: /tmp/tmp1_2b4e5z.jpg_rows/row_2/col_0.png
7681
+ 2025-03-04 18:29:41,800 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
7682
+ 2025-03-04 18:29:45,437 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r0_c0.png
7683
+ 2025-03-04 18:29:46,443 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_24.jpg_r0_c0.png
7684
+ 2025-03-04 18:29:46,788 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c0.png
7685
+ 2025-03-04 18:29:47,654 [WARNING] __main__ - Cell image not found: /tmp/tmpyd5fc1x8.jpg_rows/row_2/col_0.png
7686
+ 2025-03-04 18:29:47,654 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=three
7687
+ 2025-03-04 18:29:49,997 [WARNING] __main__ - Cell image not found: /tmp/tmpje6qj8ty.jpg_rows/row_0/col_0.png
7688
+ 2025-03-04 18:29:50,258 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r1_c0.png
7689
+ 2025-03-04 18:29:51,237 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_25.jpg_r1_c0.png
7690
+ 2025-03-04 18:29:51,649 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c0.png
7691
+ 2025-03-04 18:29:52,817 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
7692
+ 2025-03-04 18:29:53,849 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
7693
+ 2025-03-04 18:29:53,849 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
7694
+ 2025-03-04 18:29:55,903 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r0_c0.png
7695
+ 2025-03-04 18:29:56,784 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_26.jpg_r0_c0.png
7696
+ 2025-03-04 18:29:57,121 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c0.png
7697
+ 2025-03-04 18:29:58,092 [WARNING] __main__ - Cell image not found: /tmp/tmple_xivqw.jpg_rows/row_2/col_0.png
7698
+ 2025-03-04 18:29:58,092 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
7699
+ 2025-03-04 18:30:01,339 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r0_c0.png
7700
+ 2025-03-04 18:30:02,324 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_27.jpg_r0_c0.png
7701
+ 2025-03-04 18:30:02,680 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c0.png
7702
+ 2025-03-04 18:30:03,795 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c0.png
7703
+ 2025-03-04 18:30:04,805 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
7704
+ 2025-03-04 18:30:05,808 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
7705
+ 2025-03-04 18:30:05,809 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=three
7706
+ 2025-03-04 18:30:08,340 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r0_c0.png
7707
+ 2025-03-04 18:30:09,205 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_28.jpg_r0_c0.png
7708
+ 2025-03-04 18:30:09,541 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c0.png
7709
+ 2025-03-04 18:30:11,786 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
7710
+ 2025-03-04 18:30:12,603 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
7711
+ 2025-03-04 18:30:12,603 [INFO] __main__ - Processing table image: /topic-extraction/img_29.jpg, columns=three
7712
+ 2025-03-04 18:30:14,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r0_c0.png
7713
+ 2025-03-04 18:30:15,408 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_29.jpg_r0_c0.png
7714
+ 2025-03-04 18:30:15,669 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r1_c0.png
7715
+ 2025-03-04 18:30:18,844 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
7716
+ 2025-03-04 18:30:20,616 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
7717
+ 2025-03-04 18:30:20,620 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/pearson_json/_subtopics.json
7718
+ 2025-03-04 18:30:20,956 [INFO] __main__ - GPU memory cleaned up.
7719
+ 2025-03-04 18:30:20,961 [INFO] __main__ - Processing completed successfully.