SkyNait commited on
Commit
ae8cbf3
·
1 Parent(s): 98af312

fix RabbitMQ

Browse files
__pycache__/topic_extraction.cpython-310.pyc CHANGED
Binary files a/__pycache__/topic_extraction.cpython-310.pyc and b/__pycache__/topic_extraction.cpython-310.pyc differ
 
p_ms.PDF DELETED
Binary file (291 kB)
 
pearson_json/_subtopics.json CHANGED
@@ -6,6 +6,10 @@
6
  "type": "image",
7
  "key": "/topic-extraction/cells/img_1.jpg_r0_c0.png"
8
  },
 
 
 
 
9
  {
10
  "type": "image",
11
  "key": "/topic-extraction/cells/img_3.jpg_r0_c0.png"
@@ -24,177 +28,95 @@
24
  },
25
  {
26
  "type": "image",
27
- "key": "/topic-extraction/cells/img_9.jpg_r0_c0.png"
28
  },
29
  {
30
  "type": "image",
31
- "key": "/topic-extraction/cells/img_15.jpg_r0_c0.png"
32
  },
33
  {
34
  "type": "image",
35
- "key": "/topic-extraction/cells/img_16.jpg_r0_c0.png"
36
  },
37
  {
38
  "type": "image",
39
- "key": "/topic-extraction/cells/img_18.jpg_r0_c0.png"
40
  },
41
  {
42
  "type": "image",
43
- "key": "/topic-extraction/cells/img_19.jpg_r0_c0.png"
44
  },
45
  {
46
  "type": "image",
47
- "key": "/topic-extraction/cells/img_20.jpg_r0_c0.png"
48
  },
49
  {
50
  "type": "image",
51
- "key": "/topic-extraction/cells/img_22.jpg_r0_c0.png"
52
  },
53
  {
54
  "type": "image",
55
- "key": "/topic-extraction/cells/img_23.jpg_r0_c0.png"
56
  },
57
  {
58
  "type": "image",
59
- "key": "/topic-extraction/cells/img_27.jpg_r0_c0.png"
60
  },
61
  {
62
  "type": "image",
63
- "key": "/topic-extraction/cells/img_28.jpg_r0_c0.png"
64
  },
65
  {
66
  "type": "image",
67
- "key": "/topic-extraction/cells/img_29.jpg_r0_c0.png"
68
- }
69
- ],
70
- "children": []
71
- },
72
- {
73
- "title": "Factors influencing demand and supply in product markets",
74
- "contents": [
75
- {
76
- "type": "image",
77
- "key": "/topic-extraction/cells/img_2.jpg_r1_c0.png"
78
- }
79
- ],
80
- "children": []
81
- },
82
- {
83
- "title": "Why and how governments intervene in markets",
84
- "contents": [
85
  {
86
  "type": "image",
87
- "key": "/topic-extraction/cells/img_7.jpg_r1_c0.png"
88
- }
89
- ],
90
- "children": []
91
- },
92
- {
93
- "title": "The circular flow of income model",
94
- "contents": [
95
  {
96
  "type": "image",
97
- "key": "/topic-extraction/cells/img_8.jpg_r2_c0.png"
98
- }
99
- ],
100
- "children": []
101
- },
102
- {
103
- "title": "Government policy objectives",
104
- "contents": [
105
  {
106
  "type": "image",
107
- "key": "/topic-extraction/cells/img_10.jpg_r1_c0.png"
108
- }
109
- ],
110
- "children": []
111
- },
112
- {
113
- "title": "Fiscal policy",
114
- "contents": [
115
  {
116
  "type": "image",
117
- "key": "/topic-extraction/cells/img_11.jpg_r1_c0.png"
118
- }
119
- ],
120
- "children": []
121
- },
122
- {
123
- "title": "Monetary policy",
124
- "contents": [
125
  {
126
  "type": "image",
127
- "key": "/topic-extraction/cells/img_12.jpg_r1_c0.png"
128
- }
129
- ],
130
- "children": []
131
- },
132
- {
133
- "title": "Exchange rates and exchange rate policy",
134
- "contents": [
135
  {
136
  "type": "image",
137
- "key": "/topic-extraction/cells/img_13.jpg_r1_c0.png"
138
- }
139
- ],
140
- "children": []
141
- },
142
- {
143
- "title": "Free trade and protectionism",
144
- "contents": [
145
  {
146
  "type": "image",
147
- "key": "/topic-extraction/cells/img_14.jpg_r1_c0.png"
148
- }
149
- ],
150
- "children": []
151
- },
152
- {
153
- "title": "Monopoly",
154
- "contents": [
155
  {
156
  "type": "image",
157
- "key": "/topic-extraction/cells/img_17.jpg_r2_c0.png"
158
- }
159
- ],
160
- "children": []
161
- },
162
- {
163
- "title": "Economic growth",
164
- "contents": [
165
  {
166
  "type": "image",
167
- "key": "/topic-extraction/cells/img_21.jpg_r1_c0.png"
168
- }
169
- ],
170
- "children": []
171
- },
172
- {
173
- "title": "Inflation and deflation",
174
- "contents": [
175
  {
176
  "type": "image",
177
- "key": "/topic-extraction/cells/img_24.jpg_r1_c0.png"
178
- }
179
- ],
180
- "children": []
181
- },
182
- {
183
- "title": "The balance of payments",
184
- "contents": [
185
  {
186
  "type": "image",
187
- "key": "/topic-extraction/cells/img_25.jpg_r2_c0.png"
188
- }
189
- ],
190
- "children": []
191
- },
192
- {
193
- "title": "Control of the national (public sector) debt",
194
- "contents": [
195
  {
196
  "type": "image",
197
- "key": "/topic-extraction/cells/img_26.jpg_r1_c0.png"
198
  }
199
  ],
200
  "children": []
 
6
  "type": "image",
7
  "key": "/topic-extraction/cells/img_1.jpg_r0_c0.png"
8
  },
9
+ {
10
+ "type": "image",
11
+ "key": "/topic-extraction/cells/img_2.jpg_r0_c0.png"
12
+ },
13
  {
14
  "type": "image",
15
  "key": "/topic-extraction/cells/img_3.jpg_r0_c0.png"
 
28
  },
29
  {
30
  "type": "image",
31
+ "key": "/topic-extraction/cells/img_7.jpg_r0_c0.png"
32
  },
33
  {
34
  "type": "image",
35
+ "key": "/topic-extraction/cells/img_8.jpg_r1_c0.png"
36
  },
37
  {
38
  "type": "image",
39
+ "key": "/topic-extraction/cells/img_9.jpg_r0_c0.png"
40
  },
41
  {
42
  "type": "image",
43
+ "key": "/topic-extraction/cells/img_10.jpg_r0_c0.png"
44
  },
45
  {
46
  "type": "image",
47
+ "key": "/topic-extraction/cells/img_11.jpg_r0_c0.png"
48
  },
49
  {
50
  "type": "image",
51
+ "key": "/topic-extraction/cells/img_12.jpg_r0_c0.png"
52
  },
53
  {
54
  "type": "image",
55
+ "key": "/topic-extraction/cells/img_13.jpg_r0_c1.png"
56
  },
57
  {
58
  "type": "image",
59
+ "key": "/topic-extraction/cells/img_14.jpg_r0_c0.png"
60
  },
61
  {
62
  "type": "image",
63
+ "key": "/topic-extraction/cells/img_15.jpg_r0_c0.png"
64
  },
65
  {
66
  "type": "image",
67
+ "key": "/topic-extraction/cells/img_16.jpg_r0_c0.png"
68
  },
69
  {
70
  "type": "image",
71
+ "key": "/topic-extraction/cells/img_17.jpg_r1_c0.png"
72
+ },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  {
74
  "type": "image",
75
+ "key": "/topic-extraction/cells/img_18.jpg_r0_c0.png"
76
+ },
 
 
 
 
 
 
77
  {
78
  "type": "image",
79
+ "key": "/topic-extraction/cells/img_19.jpg_r0_c0.png"
80
+ },
 
 
 
 
 
 
81
  {
82
  "type": "image",
83
+ "key": "/topic-extraction/cells/img_20.jpg_r0_c0.png"
84
+ },
 
 
 
 
 
 
85
  {
86
  "type": "image",
87
+ "key": "/topic-extraction/cells/img_21.jpg_r0_c0.png"
88
+ },
 
 
 
 
 
 
89
  {
90
  "type": "image",
91
+ "key": "/topic-extraction/cells/img_22.jpg_r0_c0.png"
92
+ },
 
 
 
 
 
 
93
  {
94
  "type": "image",
95
+ "key": "/topic-extraction/cells/img_23.jpg_r0_c0.png"
96
+ },
 
 
 
 
 
 
97
  {
98
  "type": "image",
99
+ "key": "/topic-extraction/cells/img_24.jpg_r0_c0.png"
100
+ },
 
 
 
 
 
 
101
  {
102
  "type": "image",
103
+ "key": "/topic-extraction/cells/img_25.jpg_r1_c0.png"
104
+ },
 
 
 
 
 
 
105
  {
106
  "type": "image",
107
+ "key": "/topic-extraction/cells/img_26.jpg_r0_c0.png"
108
+ },
 
 
 
 
 
 
109
  {
110
  "type": "image",
111
+ "key": "/topic-extraction/cells/img_27.jpg_r0_c0.png"
112
+ },
 
 
 
 
 
 
113
  {
114
  "type": "image",
115
+ "key": "/topic-extraction/cells/img_28.jpg_r0_c0.png"
116
+ },
 
 
 
 
 
 
117
  {
118
  "type": "image",
119
+ "key": "/topic-extraction/cells/img_29.jpg_r0_c0.png"
120
  }
121
  ],
122
  "children": []
topic_extraction.log CHANGED
@@ -6931,3 +6931,555 @@ and series'. Using page 7.
6931
  2025-03-04 16:17:43,682 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/pearson_json/_subtopics.json
6932
  2025-03-04 16:17:43,995 [INFO] __main__ - GPU memory cleaned up.
6933
  2025-03-04 16:17:44,000 [INFO] __main__ - Processing completed successfully.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6931
  2025-03-04 16:17:43,682 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/pearson_json/_subtopics.json
6932
  2025-03-04 16:17:43,995 [INFO] __main__ - GPU memory cleaned up.
6933
  2025-03-04 16:17:44,000 [INFO] __main__ - Processing completed successfully.
6934
+ 2025-03-04 16:39:05,313 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf
6935
+ 2025-03-04 16:39:06,086 [INFO] __main__ - Gemini returned subtopics: {'2.1AS units': [7, 22], '2.2A2 units': [23, 43]}
6936
+ 2025-03-04 16:39:06,088 [INFO] __main__ - Loaded 3543551 bytes from local file '/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf'
6937
+ 2025-03-04 16:39:06,326 [INFO] __main__ - Computed global offset: 0
6938
+ 2025-03-04 16:39:06,326 [INFO] __main__ - Processing pages (0-based): [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
6939
+ 2025-03-04 16:39:49,136 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
6940
+ 2025-03-04 16:39:49,708 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
6941
+ 2025-03-04 16:39:50,157 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
6942
+ 2025-03-04 16:39:50,688 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
6943
+ 2025-03-04 16:39:51,083 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
6944
+ 2025-03-04 16:39:51,533 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
6945
+ 2025-03-04 16:39:52,100 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
6946
+ 2025-03-04 16:39:52,532 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
6947
+ 2025-03-04 16:39:52,942 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
6948
+ 2025-03-04 16:39:53,244 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
6949
+ 2025-03-04 16:39:53,742 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
6950
+ 2025-03-04 16:39:54,213 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
6951
+ 2025-03-04 16:39:54,761 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
6952
+ 2025-03-04 16:39:55,050 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
6953
+ 2025-03-04 16:39:55,740 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
6954
+ 2025-03-04 16:39:56,304 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
6955
+ 2025-03-04 16:39:56,780 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
6956
+ 2025-03-04 16:39:57,175 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
6957
+ 2025-03-04 16:39:57,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
6958
+ 2025-03-04 16:39:58,140 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
6959
+ 2025-03-04 16:39:58,682 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
6960
+ 2025-03-04 16:39:59,190 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
6961
+ 2025-03-04 16:39:59,577 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
6962
+ 2025-03-04 16:40:00,229 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
6963
+ 2025-03-04 16:40:00,732 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
6964
+ 2025-03-04 16:40:01,136 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
6965
+ 2025-03-04 16:40:01,706 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
6966
+ 2025-03-04 16:40:02,236 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
6967
+ 2025-03-04 16:40:02,621 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
6968
+ 2025-03-04 16:40:03,051 [INFO] __main__ - Classifying images to detect tables.
6969
+ 2025-03-04 16:40:06,927 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
6970
+ 2025-03-04 16:40:10,403 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r0_c0.png
6971
+ 2025-03-04 16:40:11,481 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c0.png
6972
+ 2025-03-04 16:40:12,796 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c1.png
6973
+ 2025-03-04 16:40:13,767 [ERROR] __main__ - Gemini subtopic identification error on attempt 0: Expecting value: line 1 column 1 (char 0)
6974
+ 2025-03-04 16:40:15,308 [ERROR] __main__ - Gemini subtopic identification error on attempt 1: Expecting value: line 1 column 1 (char 0)
6975
+ 2025-03-04 16:40:15,585 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r2_c0.png
6976
+ 2025-03-04 16:40:18,265 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r2_c1.png
6977
+ 2025-03-04 16:40:19,708 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r3_c0.png
6978
+ 2025-03-04 16:40:20,908 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r3_c1.png
6979
+ 2025-03-04 16:40:22,033 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
6980
+ 2025-03-04 16:40:22,999 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
6981
+ 2025-03-04 16:40:22,999 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
6982
+ 2025-03-04 16:40:26,396 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r0_c0.png
6983
+ 2025-03-04 16:40:27,834 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c0.png
6984
+ 2025-03-04 16:40:29,314 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c1.png
6985
+ 2025-03-04 16:40:30,652 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c0.png
6986
+ 2025-03-04 16:40:32,068 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c1.png
6987
+ 2025-03-04 16:40:33,239 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c0.png
6988
+ 2025-03-04 16:40:34,633 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c1.png
6989
+ 2025-03-04 16:40:35,597 [WARNING] __main__ - Cell image not found: /tmp/tmpkr6p74mz.jpg_rows/row_4/col_0.png
6990
+ 2025-03-04 16:40:35,598 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
6991
+ 2025-03-04 16:40:38,470 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r0_c0.png
6992
+ 2025-03-04 16:40:39,732 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c0.png
6993
+ 2025-03-04 16:40:41,236 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c1.png
6994
+ 2025-03-04 16:40:42,293 [WARNING] __main__ - Cell image not found: /tmp/tmp_4ioykgq.jpg_rows/row_2/col_0.png
6995
+ 2025-03-04 16:40:42,293 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
6996
+ 2025-03-04 16:40:44,974 [WARNING] __main__ - Cell image not found: /tmp/tmp8qnr07bo.jpg_rows/row_0/col_0.png
6997
+ 2025-03-04 16:40:44,974 [WARNING] __main__ - Cell image not found: /tmp/tmp8qnr07bo.jpg_rows/row_0/col_1.png
6998
+ 2025-03-04 16:40:45,250 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r1_c0.png
6999
+ 2025-03-04 16:40:46,109 [WARNING] __main__ - Cell image not found: /tmp/tmp8qnr07bo.jpg_rows/row_1/col_1.png
7000
+ 2025-03-04 16:40:46,385 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r2_c0.png
7001
+ 2025-03-04 16:40:47,759 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r2_c1.png
7002
+ 2025-03-04 16:40:58,221 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r3_c0.png
7003
+ 2025-03-04 16:40:59,680 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r3_c1.png
7004
+ 2025-03-04 16:41:00,920 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
7005
+ 2025-03-04 16:41:01,935 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
7006
+ 2025-03-04 16:41:01,936 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
7007
+ 2025-03-04 16:41:03,077 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r0_c0.png
7008
+ 2025-03-04 16:41:04,211 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c0.png
7009
+ 2025-03-04 16:41:05,575 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c1.png
7010
+ 2025-03-04 16:41:06,564 [WARNING] __main__ - Cell image not found: /tmp/tmph28hdp5v.jpg_rows/row_2/col_0.png
7011
+ 2025-03-04 16:41:06,564 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
7012
+ 2025-03-04 16:41:08,734 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r0_c0.png
7013
+ 2025-03-04 16:41:09,813 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c0.png
7014
+ 2025-03-04 16:41:11,241 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c1.png
7015
+ 2025-03-04 16:41:12,566 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
7016
+ 2025-03-04 16:41:13,534 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
7017
+ 2025-03-04 16:41:13,535 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
7018
+ 2025-03-04 16:41:17,251 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r0_c0.png
7019
+ 2025-03-04 16:41:18,440 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c0.png
7020
+ 2025-03-04 16:41:20,101 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c1.png
7021
+ 2025-03-04 16:41:21,465 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c0.png
7022
+ 2025-03-04 16:41:22,836 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c1.png
7023
+ 2025-03-04 16:41:24,168 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
7024
+ 2025-03-04 16:41:25,738 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
7025
+ 2025-03-04 16:41:25,738 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
7026
+ 2025-03-04 16:41:28,044 [WARNING] __main__ - Cell image not found: /tmp/tmp2s0xxpac.jpg_rows/row_0/col_0.png
7027
+ 2025-03-04 16:41:28,044 [WARNING] __main__ - Cell image not found: /tmp/tmp2s0xxpac.jpg_rows/row_0/col_1.png
7028
+ 2025-03-04 16:41:28,317 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r1_c0.png
7029
+ 2025-03-04 16:41:29,465 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c0.png
7030
+ 2025-03-04 16:41:30,769 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c1.png
7031
+ 2025-03-04 16:41:32,249 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c0.png
7032
+ 2025-03-04 16:41:33,665 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c1.png
7033
+ 2025-03-04 16:41:34,812 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
7034
+ 2025-03-04 16:41:35,736 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
7035
+ 2025-03-04 16:41:35,737 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
7036
+ 2025-03-04 16:41:38,094 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r0_c0.png
7037
+ 2025-03-04 16:41:39,194 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c0.png
7038
+ 2025-03-04 16:41:40,315 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c1.png
7039
+ 2025-03-04 16:41:41,569 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c0.png
7040
+ 2025-03-04 16:41:42,914 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c1.png
7041
+ 2025-03-04 16:41:44,447 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r3_c0.png
7042
+ 2025-03-04 16:41:45,567 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r3_c1.png
7043
+ 2025-03-04 16:41:46,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
7044
+ 2025-03-04 16:41:47,934 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
7045
+ 2025-03-04 16:41:47,935 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
7046
+ 2025-03-04 16:41:48,856 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r0_c0.png
7047
+ 2025-03-04 16:41:49,986 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c0.png
7048
+ 2025-03-04 16:41:51,601 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c1.png
7049
+ 2025-03-04 16:41:52,769 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
7050
+ 2025-03-04 16:41:53,870 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
7051
+ 2025-03-04 16:41:53,871 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=three
7052
+ 2025-03-04 16:41:56,443 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r0_c0.png
7053
+ 2025-03-04 16:41:57,555 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c0.png
7054
+ 2025-03-04 16:41:59,137 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c1.png
7055
+ 2025-03-04 16:42:00,476 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
7056
+ 2025-03-04 16:42:01,609 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
7057
+ 2025-03-04 16:42:01,610 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
7058
+ 2025-03-04 16:42:04,361 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r0_c0.png
7059
+ 2025-03-04 16:42:05,501 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c0.png
7060
+ 2025-03-04 16:42:07,023 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c1.png
7061
+ 2025-03-04 16:42:08,304 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
7062
+ 2025-03-04 16:42:09,424 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
7063
+ 2025-03-04 16:42:09,425 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
7064
+ 2025-03-04 16:42:12,614 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
7065
+ 2025-03-04 16:42:13,436 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
7066
+ 2025-03-04 16:42:13,691 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c1.png
7067
+ 2025-03-04 16:42:14,930 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c0.png
7068
+ 2025-03-04 16:42:16,395 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c1.png
7069
+ 2025-03-04 16:42:17,794 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
7070
+ 2025-03-04 16:42:18,797 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
7071
+ 2025-03-04 16:42:19,053 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c0.png
7072
+ 2025-03-04 16:42:20,378 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c1.png
7073
+ 2025-03-04 16:42:21,565 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
7074
+ 2025-03-04 16:42:22,635 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
7075
+ 2025-03-04 16:42:22,635 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
7076
+ 2025-03-04 16:42:23,713 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r0_c0.png
7077
+ 2025-03-04 16:42:24,787 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c0.png
7078
+ 2025-03-04 16:42:26,077 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c1.png
7079
+ 2025-03-04 16:42:27,195 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
7080
+ 2025-03-04 16:42:28,273 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
7081
+ 2025-03-04 16:42:28,274 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
7082
+ 2025-03-04 16:42:32,234 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r0_c0.png
7083
+ 2025-03-04 16:42:33,414 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c0.png
7084
+ 2025-03-04 16:42:35,062 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c1.png
7085
+ 2025-03-04 16:42:36,362 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c0.png
7086
+ 2025-03-04 16:42:37,790 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c1.png
7087
+ 2025-03-04 16:42:38,877 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c0.png
7088
+ 2025-03-04 16:42:40,011 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c1.png
7089
+ 2025-03-04 16:42:41,094 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
7090
+ 2025-03-04 16:42:42,019 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
7091
+ 2025-03-04 16:42:42,020 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
7092
+ 2025-03-04 16:42:45,163 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r0_c0.png
7093
+ 2025-03-04 16:42:46,253 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c0.png
7094
+ 2025-03-04 16:42:47,665 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c1.png
7095
+ 2025-03-04 16:42:48,812 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c0.png
7096
+ 2025-03-04 16:42:50,033 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c1.png
7097
+ 2025-03-04 16:42:51,432 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c0.png
7098
+ 2025-03-04 16:42:52,858 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c1.png
7099
+ 2025-03-04 16:42:54,216 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c0.png
7100
+ 2025-03-04 16:42:55,778 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c1.png
7101
+ 2025-03-04 16:42:56,931 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
7102
+ 2025-03-04 16:42:57,851 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
7103
+ 2025-03-04 16:42:57,851 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
7104
+ 2025-03-04 16:43:01,201 [WARNING] __main__ - Cell image not found: /tmp/tmpdmvh3rc8.jpg_rows/row_0/col_0.png
7105
+ 2025-03-04 16:43:01,475 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r1_c0.png
7106
+ 2025-03-04 16:43:02,567 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c0.png
7107
+ 2025-03-04 16:43:04,176 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c1.png
7108
+ 2025-03-04 16:43:05,365 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c0.png
7109
+ 2025-03-04 16:43:06,802 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c1.png
7110
+ 2025-03-04 16:43:07,969 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
7111
+ 2025-03-04 16:43:08,946 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
7112
+ 2025-03-04 16:43:08,946 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
7113
+ 2025-03-04 16:43:10,806 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r0_c0.png
7114
+ 2025-03-04 16:43:11,925 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c0.png
7115
+ 2025-03-04 16:43:13,286 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c1.png
7116
+ 2025-03-04 16:43:14,651 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r2_c0.png
7117
+ 2025-03-04 16:43:16,070 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r2_c1.png
7118
+ 2025-03-04 16:43:17,209 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
7119
+ 2025-03-04 16:43:18,206 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
7120
+ 2025-03-04 16:43:18,206 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
7121
+ 2025-03-04 16:43:21,350 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r0_c0.png
7122
+ 2025-03-04 16:43:22,479 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c0.png
7123
+ 2025-03-04 16:43:24,003 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c1.png
7124
+ 2025-03-04 16:43:25,317 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c0.png
7125
+ 2025-03-04 16:43:26,815 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c1.png
7126
+ 2025-03-04 16:43:28,078 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
7127
+ 2025-03-04 16:43:29,086 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
7128
+ 2025-03-04 16:43:29,086 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
7129
+ 2025-03-04 16:43:30,918 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r0_c0.png
7130
+ 2025-03-04 16:43:32,141 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c0.png
7131
+ 2025-03-04 16:43:33,282 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c1.png
7132
+ 2025-03-04 16:43:34,592 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r2_c0.png
7133
+ 2025-03-04 16:43:36,080 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r2_c1.png
7134
+ 2025-03-04 16:43:37,530 [WARNING] __main__ - Cell image not found: /tmp/tmp6_d2lvpn.jpg_rows/row_3/col_0.png
7135
+ 2025-03-04 16:43:37,531 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
7136
+ 2025-03-04 16:43:40,529 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r0_c0.png
7137
+ 2025-03-04 16:43:41,854 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c0.png
7138
+ 2025-03-04 16:43:43,415 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c1.png
7139
+ 2025-03-04 16:43:45,170 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
7140
+ 2025-03-04 16:43:46,291 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
7141
+ 2025-03-04 16:43:46,292 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
7142
+ 2025-03-04 16:43:48,973 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r0_c0.png
7143
+ 2025-03-04 16:43:50,109 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c0.png
7144
+ 2025-03-04 16:43:51,618 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c1.png
7145
+ 2025-03-04 16:43:52,724 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_22.jpg_r1_c1.png
7146
+ 2025-03-04 16:43:52,904 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
7147
+ 2025-03-04 16:43:54,163 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
7148
+ 2025-03-04 16:43:54,163 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
7149
+ 2025-03-04 16:43:56,200 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r0_c0.png
7150
+ 2025-03-04 16:43:57,589 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c0.png
7151
+ 2025-03-04 16:43:59,010 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c1.png
7152
+ 2025-03-04 16:44:00,106 [WARNING] __main__ - Cell image not found: /tmp/tmp5l7mn427.jpg_rows/row_2/col_0.png
7153
+ 2025-03-04 16:44:00,107 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
7154
+ 2025-03-04 16:44:03,906 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r0_c0.png
7155
+ 2025-03-04 16:44:05,120 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c0.png
7156
+ 2025-03-04 16:44:06,699 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c1.png
7157
+ 2025-03-04 16:44:08,013 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_24.jpg_r1_c1.png
7158
+ 2025-03-04 16:44:08,014 [WARNING] __main__ - Cell image not found: /tmp/tmp7f4e012v.jpg_rows/row_2/col_0.png
7159
+ 2025-03-04 16:44:08,019 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=three
7160
+ 2025-03-04 16:44:10,562 [WARNING] __main__ - Cell image not found: /tmp/tmpsr1107vb.jpg_rows/row_0/col_0.png
7161
+ 2025-03-04 16:44:10,823 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r1_c0.png
7162
+ 2025-03-04 16:44:12,067 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c0.png
7163
+ 2025-03-04 16:44:13,630 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c1.png
7164
+ 2025-03-04 16:44:15,001 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
7165
+ 2025-03-04 16:44:16,162 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
7166
+ 2025-03-04 16:44:16,163 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
7167
+ 2025-03-04 16:44:18,257 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r0_c0.png
7168
+ 2025-03-04 16:44:19,367 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c0.png
7169
+ 2025-03-04 16:44:20,866 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c1.png
7170
+ 2025-03-04 16:44:22,029 [WARNING] __main__ - Cell image not found: /tmp/tmpm4jr09co.jpg_rows/row_2/col_0.png
7171
+ 2025-03-04 16:44:22,030 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
7172
+ 2025-03-04 16:44:25,458 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r0_c0.png
7173
+ 2025-03-04 16:44:26,636 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c0.png
7174
+ 2025-03-04 16:44:28,117 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c1.png
7175
+ 2025-03-04 16:44:29,316 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c0.png
7176
+ 2025-03-04 16:44:30,892 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c1.png
7177
+ 2025-03-04 16:44:32,031 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
7178
+ 2025-03-04 16:44:32,983 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
7179
+ 2025-03-04 16:44:32,984 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=three
7180
+ 2025-03-04 16:44:35,702 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r0_c0.png
7181
+ 2025-03-04 16:44:37,077 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c0.png
7182
+ 2025-03-04 16:44:38,586 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c1.png
7183
+ 2025-03-04 16:44:40,000 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
7184
+ 2025-03-04 16:44:41,005 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
7185
+ 2025-03-04 16:44:41,006 [INFO] __main__ - Processing table image: /topic-extraction/img_29.jpg, columns=three
7186
+ 2025-03-04 16:44:42,801 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r0_c0.png
7187
+ 2025-03-04 16:44:43,877 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r1_c0.png
7188
+ 2025-03-04 16:44:45,297 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r1_c1.png
7189
+ 2025-03-04 16:44:46,572 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
7190
+ 2025-03-04 16:44:47,560 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
7191
+ 2025-03-04 16:44:47,564 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/pearson_json/_subtopics.json
7192
+ 2025-03-04 16:44:47,893 [INFO] __main__ - GPU memory cleaned up.
7193
+ 2025-03-04 16:44:47,898 [INFO] __main__ - Processing completed successfully.
7194
+ 2025-03-04 17:13:14,000 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf
7195
+ 2025-03-04 17:13:14,813 [INFO] __main__ - Gemini returned subtopics: {'2.1AS units': [7, 22], '2.2A2 units': [23, 43]}
7196
+ 2025-03-04 17:13:14,814 [INFO] __main__ - Loaded 3543551 bytes from local file '/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf'
7197
+ 2025-03-04 17:13:15,049 [INFO] __main__ - Computed global offset: 0
7198
+ 2025-03-04 17:13:15,049 [INFO] __main__ - Processing pages (0-based): [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
7199
+ 2025-03-04 17:13:55,840 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
7200
+ 2025-03-04 17:13:56,487 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
7201
+ 2025-03-04 17:13:56,943 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
7202
+ 2025-03-04 17:13:57,441 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
7203
+ 2025-03-04 17:13:57,816 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
7204
+ 2025-03-04 17:13:58,206 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
7205
+ 2025-03-04 17:13:58,724 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
7206
+ 2025-03-04 17:13:59,172 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
7207
+ 2025-03-04 17:13:59,579 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
7208
+ 2025-03-04 17:13:59,870 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
7209
+ 2025-03-04 17:14:00,375 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
7210
+ 2025-03-04 17:14:00,860 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
7211
+ 2025-03-04 17:14:01,418 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
7212
+ 2025-03-04 17:14:01,705 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
7213
+ 2025-03-04 17:14:02,299 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
7214
+ 2025-03-04 17:14:02,835 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
7215
+ 2025-03-04 17:14:03,343 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
7216
+ 2025-03-04 17:14:03,722 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
7217
+ 2025-03-04 17:14:04,256 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
7218
+ 2025-03-04 17:14:04,622 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
7219
+ 2025-03-04 17:14:05,155 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
7220
+ 2025-03-04 17:14:05,630 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
7221
+ 2025-03-04 17:14:06,016 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
7222
+ 2025-03-04 17:14:06,624 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
7223
+ 2025-03-04 17:14:07,057 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
7224
+ 2025-03-04 17:14:07,468 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
7225
+ 2025-03-04 17:14:08,013 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
7226
+ 2025-03-04 17:14:08,559 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
7227
+ 2025-03-04 17:14:08,944 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
7228
+ 2025-03-04 17:14:09,370 [INFO] __main__ - Classifying images to detect tables.
7229
+ 2025-03-04 17:14:13,356 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
7230
+ 2025-03-04 17:14:16,548 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r0_c0.png
7231
+ 2025-03-04 17:14:17,824 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c0.png
7232
+ 2025-03-04 17:14:19,207 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c1.png
7233
+ 2025-03-04 17:14:20,785 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r2_c0.png
7234
+ 2025-03-04 17:14:22,337 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r2_c1.png
7235
+ 2025-03-04 17:14:24,117 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r3_c0.png
7236
+ 2025-03-04 17:14:25,468 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r3_c1.png
7237
+ 2025-03-04 17:14:26,797 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
7238
+ 2025-03-04 17:14:27,715 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
7239
+ 2025-03-04 17:14:27,715 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
7240
+ 2025-03-04 17:14:31,016 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r0_c0.png
7241
+ 2025-03-04 17:14:32,468 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c0.png
7242
+ 2025-03-04 17:14:34,010 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c1.png
7243
+ 2025-03-04 17:14:37,127 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c0.png
7244
+ 2025-03-04 17:14:38,574 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c1.png
7245
+ 2025-03-04 17:14:40,014 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c0.png
7246
+ 2025-03-04 17:14:41,453 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c1.png
7247
+ 2025-03-04 17:14:43,026 [WARNING] __main__ - Cell image not found: /tmp/tmpgz3m3b9n.jpg_rows/row_4/col_0.png
7248
+ 2025-03-04 17:14:43,026 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
7249
+ 2025-03-04 17:14:45,066 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r0_c0.png
7250
+ 2025-03-04 17:14:46,513 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c0.png
7251
+ 2025-03-04 17:14:48,054 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c1.png
7252
+ 2025-03-04 17:14:49,517 [WARNING] __main__ - Cell image not found: /tmp/tmpkn9damk4.jpg_rows/row_2/col_0.png
7253
+ 2025-03-04 17:14:49,518 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
7254
+ 2025-03-04 17:14:51,857 [WARNING] __main__ - Cell image not found: /tmp/tmp_7v9cvwb.jpg_rows/row_0/col_0.png
7255
+ 2025-03-04 17:14:51,857 [WARNING] __main__ - Cell image not found: /tmp/tmp_7v9cvwb.jpg_rows/row_0/col_1.png
7256
+ 2025-03-04 17:14:52,128 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r1_c0.png
7257
+ 2025-03-04 17:14:53,246 [WARNING] __main__ - Cell image not found: /tmp/tmp_7v9cvwb.jpg_rows/row_1/col_1.png
7258
+ 2025-03-04 17:14:53,522 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r2_c0.png
7259
+ 2025-03-04 17:14:54,896 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r2_c1.png
7260
+ 2025-03-04 17:14:56,522 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r3_c0.png
7261
+ 2025-03-04 17:14:57,958 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r3_c1.png
7262
+ 2025-03-04 17:14:59,510 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
7263
+ 2025-03-04 17:15:00,493 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
7264
+ 2025-03-04 17:15:00,494 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
7265
+ 2025-03-04 17:15:01,571 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r0_c0.png
7266
+ 2025-03-04 17:15:02,997 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c0.png
7267
+ 2025-03-04 17:15:04,424 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c1.png
7268
+ 2025-03-04 17:15:05,776 [WARNING] __main__ - Cell image not found: /tmp/tmpr68pawul.jpg_rows/row_2/col_0.png
7269
+ 2025-03-04 17:15:05,776 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
7270
+ 2025-03-04 17:15:07,783 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r0_c0.png
7271
+ 2025-03-04 17:15:09,073 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c0.png
7272
+ 2025-03-04 17:15:10,711 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c1.png
7273
+ 2025-03-04 17:15:12,507 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
7274
+ 2025-03-04 17:15:13,630 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
7275
+ 2025-03-04 17:15:13,631 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
7276
+ 2025-03-04 17:15:16,878 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r0_c0.png
7277
+ 2025-03-04 17:15:18,344 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c0.png
7278
+ 2025-03-04 17:15:19,949 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c1.png
7279
+ 2025-03-04 17:15:22,552 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c0.png
7280
+ 2025-03-04 17:15:23,888 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c1.png
7281
+ 2025-03-04 17:15:25,222 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
7282
+ 2025-03-04 17:15:26,200 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
7283
+ 2025-03-04 17:15:26,200 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
7284
+ 2025-03-04 17:15:28,378 [WARNING] __main__ - Cell image not found: /tmp/tmpeauayzcm.jpg_rows/row_0/col_0.png
7285
+ 2025-03-04 17:15:28,378 [WARNING] __main__ - Cell image not found: /tmp/tmpeauayzcm.jpg_rows/row_0/col_1.png
7286
+ 2025-03-04 17:15:28,642 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r1_c0.png
7287
+ 2025-03-04 17:15:30,092 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c0.png
7288
+ 2025-03-04 17:15:31,485 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c1.png
7289
+ 2025-03-04 17:15:33,367 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c0.png
7290
+ 2025-03-04 17:15:34,783 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c1.png
7291
+ 2025-03-04 17:15:36,384 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
7292
+ 2025-03-04 17:15:37,395 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
7293
+ 2025-03-04 17:15:37,396 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
7294
+ 2025-03-04 17:15:39,469 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r0_c0.png
7295
+ 2025-03-04 17:15:40,865 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c0.png
7296
+ 2025-03-04 17:15:42,177 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c1.png
7297
+ 2025-03-04 17:15:43,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c0.png
7298
+ 2025-03-04 17:15:45,111 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c1.png
7299
+ 2025-03-04 17:15:47,334 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r3_c0.png
7300
+ 2025-03-04 17:15:48,513 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r3_c1.png
7301
+ 2025-03-04 17:15:49,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
7302
+ 2025-03-04 17:15:50,582 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
7303
+ 2025-03-04 17:15:50,582 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
7304
+ 2025-03-04 17:15:51,570 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r0_c0.png
7305
+ 2025-03-04 17:15:53,041 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c0.png
7306
+ 2025-03-04 17:15:54,468 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c1.png
7307
+ 2025-03-04 17:15:55,844 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
7308
+ 2025-03-04 17:15:56,966 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
7309
+ 2025-03-04 17:15:56,967 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=three
7310
+ 2025-03-04 17:15:59,374 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r0_c0.png
7311
+ 2025-03-04 17:16:00,804 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c0.png
7312
+ 2025-03-04 17:16:02,580 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c1.png
7313
+ 2025-03-04 17:16:04,173 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
7314
+ 2025-03-04 17:16:05,062 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
7315
+ 2025-03-04 17:16:05,062 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
7316
+ 2025-03-04 17:16:07,653 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r0_c0.png
7317
+ 2025-03-04 17:16:09,201 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c0.png
7318
+ 2025-03-04 17:16:10,928 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c1.png
7319
+ 2025-03-04 17:16:12,739 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
7320
+ 2025-03-04 17:16:13,735 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
7321
+ 2025-03-04 17:16:13,735 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
7322
+ 2025-03-04 17:16:16,756 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
7323
+ 2025-03-04 17:16:17,689 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
7324
+ 2025-03-04 17:16:17,947 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c1.png
7325
+ 2025-03-04 17:16:19,521 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c0.png
7326
+ 2025-03-04 17:16:21,310 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c1.png
7327
+ 2025-03-04 17:16:23,370 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
7328
+ 2025-03-04 17:16:24,380 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
7329
+ 2025-03-04 17:16:24,634 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c0.png
7330
+ 2025-03-04 17:16:26,009 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c1.png
7331
+ 2025-03-04 17:16:27,859 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
7332
+ 2025-03-04 17:16:28,943 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
7333
+ 2025-03-04 17:16:28,943 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
7334
+ 2025-03-04 17:16:30,062 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r0_c0.png
7335
+ 2025-03-04 17:16:31,485 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c0.png
7336
+ 2025-03-04 17:16:32,831 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c1.png
7337
+ 2025-03-04 17:16:34,357 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
7338
+ 2025-03-04 17:16:35,420 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
7339
+ 2025-03-04 17:16:35,420 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
7340
+ 2025-03-04 17:16:38,997 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r0_c0.png
7341
+ 2025-03-04 17:16:40,384 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c0.png
7342
+ 2025-03-04 17:16:42,086 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c1.png
7343
+ 2025-03-04 17:16:43,960 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c0.png
7344
+ 2025-03-04 17:16:45,362 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c1.png
7345
+ 2025-03-04 17:16:47,152 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c0.png
7346
+ 2025-03-04 17:16:48,540 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c1.png
7347
+ 2025-03-04 17:16:49,983 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
7348
+ 2025-03-04 17:16:51,054 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
7349
+ 2025-03-04 17:16:51,054 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
7350
+ 2025-03-04 17:16:54,130 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r0_c0.png
7351
+ 2025-03-04 17:16:55,482 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c0.png
7352
+ 2025-03-04 17:16:56,770 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c1.png
7353
+ 2025-03-04 17:16:58,649 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c0.png
7354
+ 2025-03-04 17:16:59,944 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c1.png
7355
+ 2025-03-04 17:17:01,397 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c0.png
7356
+ 2025-03-04 17:17:02,716 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c1.png
7357
+ 2025-03-04 17:17:04,306 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c0.png
7358
+ 2025-03-04 17:17:05,735 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c1.png
7359
+ 2025-03-04 17:17:07,796 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
7360
+ 2025-03-04 17:17:08,712 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
7361
+ 2025-03-04 17:17:08,712 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
7362
+ 2025-03-04 17:17:11,791 [WARNING] __main__ - Cell image not found: /tmp/tmp8t199g9l.jpg_rows/row_0/col_0.png
7363
+ 2025-03-04 17:17:12,053 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r1_c0.png
7364
+ 2025-03-04 17:17:13,388 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c0.png
7365
+ 2025-03-04 17:17:14,808 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c1.png
7366
+ 2025-03-04 17:17:16,828 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c0.png
7367
+ 2025-03-04 17:17:18,305 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c1.png
7368
+ 2025-03-04 17:17:20,126 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
7369
+ 2025-03-04 17:17:21,082 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
7370
+ 2025-03-04 17:17:21,082 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
7371
+ 2025-03-04 17:17:22,864 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r0_c0.png
7372
+ 2025-03-04 17:17:24,349 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c0.png
7373
+ 2025-03-04 17:17:25,674 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c1.png
7374
+ 2025-03-04 17:17:27,576 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r2_c0.png
7375
+ 2025-03-04 17:17:28,875 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r2_c1.png
7376
+ 2025-03-04 17:17:30,338 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
7377
+ 2025-03-04 17:17:31,459 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
7378
+ 2025-03-04 17:17:31,460 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
7379
+ 2025-03-04 17:17:33,895 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r0_c0.png
7380
+ 2025-03-04 17:17:35,505 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c0.png
7381
+ 2025-03-04 17:17:36,920 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c1.png
7382
+ 2025-03-04 17:17:38,707 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c0.png
7383
+ 2025-03-04 17:17:40,159 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c1.png
7384
+ 2025-03-04 17:17:42,150 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
7385
+ 2025-03-04 17:17:43,069 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
7386
+ 2025-03-04 17:17:43,069 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
7387
+ 2025-03-04 17:17:44,770 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r0_c0.png
7388
+ 2025-03-04 17:17:46,112 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c0.png
7389
+ 2025-03-04 17:17:47,369 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c1.png
7390
+ 2025-03-04 17:17:48,764 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r2_c0.png
7391
+ 2025-03-04 17:17:50,279 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r2_c1.png
7392
+ 2025-03-04 17:17:52,008 [WARNING] __main__ - Cell image not found: /tmp/tmpyuhd9sl8.jpg_rows/row_3/col_0.png
7393
+ 2025-03-04 17:17:52,009 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
7394
+ 2025-03-04 17:17:54,856 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r0_c0.png
7395
+ 2025-03-04 17:17:56,238 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c0.png
7396
+ 2025-03-04 17:17:58,121 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c1.png
7397
+ 2025-03-04 17:18:00,408 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
7398
+ 2025-03-04 17:18:01,418 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
7399
+ 2025-03-04 17:18:01,418 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
7400
+ 2025-03-04 17:18:03,917 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r0_c0.png
7401
+ 2025-03-04 17:18:05,292 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c0.png
7402
+ 2025-03-04 17:18:07,082 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c1.png
7403
+ 2025-03-04 17:18:08,934 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
7404
+ 2025-03-04 17:18:10,012 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
7405
+ 2025-03-04 17:18:10,012 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
7406
+ 2025-03-04 17:18:11,952 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r0_c0.png
7407
+ 2025-03-04 17:18:13,275 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c0.png
7408
+ 2025-03-04 17:18:14,714 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c1.png
7409
+ 2025-03-04 17:18:16,140 [WARNING] __main__ - Cell image not found: /tmp/tmp91opcy4g.jpg_rows/row_2/col_0.png
7410
+ 2025-03-04 17:18:16,140 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
7411
+ 2025-03-04 17:18:19,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r0_c0.png
7412
+ 2025-03-04 17:18:21,092 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c0.png
7413
+ 2025-03-04 17:18:23,324 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c1.png
7414
+ 2025-03-04 17:18:26,880 [WARNING] __main__ - Cell image not found: /tmp/tmpigalpv91.jpg_rows/row_2/col_0.png
7415
+ 2025-03-04 17:18:26,880 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=three
7416
+ 2025-03-04 17:18:29,208 [WARNING] __main__ - Cell image not found: /tmp/tmppaoedyal.jpg_rows/row_0/col_0.png
7417
+ 2025-03-04 17:18:29,475 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r1_c0.png
7418
+ 2025-03-04 17:18:30,947 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c0.png
7419
+ 2025-03-04 17:18:33,064 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c1.png
7420
+ 2025-03-04 17:18:36,316 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
7421
+ 2025-03-04 17:18:37,482 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
7422
+ 2025-03-04 17:18:37,483 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
7423
+ 2025-03-04 17:18:39,543 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r0_c0.png
7424
+ 2025-03-04 17:18:40,901 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c0.png
7425
+ 2025-03-04 17:18:42,749 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c1.png
7426
+ 2025-03-04 17:18:44,332 [WARNING] __main__ - Cell image not found: /tmp/tmp22n5c_8q.jpg_rows/row_2/col_0.png
7427
+ 2025-03-04 17:18:44,332 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
7428
+ 2025-03-04 17:18:47,634 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r0_c0.png
7429
+ 2025-03-04 17:18:49,048 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c0.png
7430
+ 2025-03-04 17:18:50,572 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c1.png
7431
+ 2025-03-04 17:18:52,196 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c0.png
7432
+ 2025-03-04 17:18:53,636 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c1.png
7433
+ 2025-03-04 17:18:55,054 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
7434
+ 2025-03-04 17:18:56,002 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
7435
+ 2025-03-04 17:18:56,003 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=three
7436
+ 2025-03-04 17:18:58,520 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r0_c0.png
7437
+ 2025-03-04 17:18:59,970 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c0.png
7438
+ 2025-03-04 17:19:01,773 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c1.png
7439
+ 2025-03-04 17:19:03,587 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
7440
+ 2025-03-04 17:19:04,755 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
7441
+ 2025-03-04 17:19:04,755 [INFO] __main__ - Processing table image: /topic-extraction/img_29.jpg, columns=three
7442
+ 2025-03-04 17:19:06,526 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r0_c0.png
7443
+ 2025-03-04 17:19:07,817 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r1_c0.png
7444
+ 2025-03-04 17:19:09,284 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r1_c1.png
7445
+ 2025-03-04 17:19:10,915 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
7446
+ 2025-03-04 17:19:11,969 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
7447
+ 2025-03-04 17:19:11,972 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/pearson_json/_subtopics.json
7448
+ 2025-03-04 17:19:12,278 [INFO] __main__ - GPU memory cleaned up.
7449
+ 2025-03-04 17:19:12,283 [INFO] __main__ - Processing completed successfully.
7450
+ 2025-03-04 17:28:37,803 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf
7451
+ 2025-03-04 17:28:38,622 [INFO] __main__ - Gemini returned subtopics: {'2.1AS units': [7, 22], '2.2A2 units': [23, 43]}
7452
+ 2025-03-04 17:28:38,624 [INFO] __main__ - Loaded 3543551 bytes from local file '/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf'
7453
+ 2025-03-04 17:28:38,859 [INFO] __main__ - Computed global offset: 0
7454
+ 2025-03-04 17:28:38,860 [INFO] __main__ - Processing pages (0-based): [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
7455
+ 2025-03-04 17:29:19,633 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
7456
+ 2025-03-04 17:29:20,237 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
7457
+ 2025-03-04 17:29:20,620 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
7458
+ 2025-03-04 17:29:21,124 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
7459
+ 2025-03-04 17:29:21,413 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
7460
+ 2025-03-04 17:29:21,792 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
7461
+ 2025-03-04 17:29:22,350 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
7462
+ 2025-03-04 17:29:22,827 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
7463
+ 2025-03-04 17:29:23,260 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
7464
+ 2025-03-04 17:29:23,574 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
7465
+ 2025-03-04 17:29:24,083 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
7466
+ 2025-03-04 17:29:24,602 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
7467
+ 2025-03-04 17:29:25,141 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
7468
+ 2025-03-04 17:29:25,442 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
7469
+ 2025-03-04 17:29:26,082 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
7470
+ 2025-03-04 17:29:26,668 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
7471
+ 2025-03-04 17:29:27,176 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
7472
+ 2025-03-04 17:29:27,575 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
7473
+ 2025-03-04 17:29:28,110 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
7474
+ 2025-03-04 17:29:28,509 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
7475
+ 2025-03-04 17:29:29,046 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
7476
+ 2025-03-04 17:29:29,553 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
7477
+ 2025-03-04 17:29:29,936 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
7478
+ 2025-03-04 17:29:30,523 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
7479
+ 2025-03-04 17:29:31,034 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
7480
+ 2025-03-04 17:29:31,417 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
7481
+ 2025-03-04 17:29:31,991 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
7482
+ 2025-03-04 17:29:32,506 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
7483
+ 2025-03-04 17:29:32,884 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
7484
+ 2025-03-04 17:29:33,308 [INFO] __main__ - Classifying images to detect tables.
7485
+ 2025-03-04 17:59:52,883 [INFO] __main__ - GPU memory cleaned up.
topic_extraction.py CHANGED
@@ -299,95 +299,113 @@ def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: st
299
  for attempt in range(max_retries + 1):
300
  try:
301
  prompt = """
302
- You are given an image from an educational curriculum specification. The image may contain:
303
  1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
304
- 2) A subtopic heading in the format "<number>.<number>", for example "2.5", "2.6", or "3.4".
305
  3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
306
- 4) Possibly no relevant text at all.
307
 
308
  Your task is to extract:
309
  - **"title"**: A recognized main topic or heading text.
310
- - **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4"), as an array of strings.
311
 
312
  Follow these rules:
313
 
314
- (1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued", (remove the word "continued") then:
315
- - Put that text (without the word "continued") in "title". (e.g. "2 Algebra and functions")
316
- - "subtopics" should be an empty array, unless you also see smaller subtopic numbers.
 
317
 
318
- (2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4", then:
319
  - Collect those exact strings in the JSON key "subtopics" (an array of strings).
320
- - "title" in this case should be an empty string if you only detect subtopics.
321
  (Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).
322
 
323
- (3) If no main topic or subtopic is detected but the text appears to be a heading (e.g. "Scarcity, choice and opportunity cost"), return:
324
- {{
325
- "title": "",
326
- "subtopics": []
327
- }}
 
328
 
329
  (4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
330
- - Use the **left column text** as "title".
331
  - "subtopics" remains empty.
332
- Example:
333
  If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
334
  {
335
  "title": "Scarcity, choice and opportunity cost",
336
  "subtopics": []
337
  }
338
 
339
- (5) **If there is a character + digit pattern** in the left column for a two-column table (for example "G2", "G3", "G4", "C1"), treat that as a topic-like label:
 
 
 
 
 
 
 
 
 
340
  - Put that label text into "title" (e.g. "G2").
341
  - "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.
342
 
343
- (6) **Output must be valid JSON** in this exact structure, with no extra text or explanation:
344
  {
345
  "title": "...",
346
  "subtopics": [...]
347
  }
348
 
349
- (7) If the image is blank or truncated, defined as:
350
- - Contains no words at all (e.g. a blank white or black image)
351
- - Contains only a truncated snippet of words such as "Topics", "What students need to learn" with blue background
352
- - Contains a truncated snippet with words like "Topics", "What students need to learn", "Content" with gray background (RGB (166,166,166) or (180,180,180)) then return:
353
- {{
354
- "title": "EMPTY_IMAGE",
355
- "subtopics": []
356
- }}
 
 
 
 
 
 
 
 
357
 
358
  **Examples**:
359
 
360
- - If the image text is `"2 Algebra and functions continued"`, return:
361
  {
362
  "title": "2 Algebra and functions",
363
  "subtopics": []
364
  }
365
 
366
- - If the image text is `"2.5 Solve linear and quadratic inequalities ..."`, return:
367
  {
368
  "title": "",
369
  "subtopics": ["2.5"]
370
  }
371
 
372
- - If the image text is `"Scarcity, choice and opportunity cost"` (with no numeric patterns at all), return:
373
  {
374
- "title": "Scarcity, choice and opportunity cost",
375
  "subtopics": []
376
  }
377
 
378
- - If the left column says `"G2"` and the right column has details, but no subtopic numbers, return:
379
  {
380
  "title": "G2",
381
  "subtopics": []
382
  }
383
 
384
- - If you cannot recognize any text matching these patterns, or if nothing is found, return:
385
  {
386
- "title": "",
387
  "subtopics": []
388
  }
389
  """
390
-
391
  global _GEMINI_CLIENT
392
  if _GEMINI_CLIENT is None:
393
  _GEMINI_CLIENT = genai.Client(api_key=api_key)
@@ -482,7 +500,6 @@ class S3ImageWriter(DataWriter):
482
  elif cls == "THREE_COLUMN":
483
  info['final_alt'] = "HAS TO BE PROCESSED - three column table"
484
  elif cls == "EMPTY_IMAGE":
485
- # Remove markdown reference, delete from descriptions and S3.
486
  md_content = md_content.replace(f"![]({key}{p})", "")
487
  try:
488
  self.s3_writer.delete(info['s3_path'])
@@ -865,7 +882,6 @@ class MineruNoTextProcessor:
865
  def process(self, pdf_path: str) -> Dict[str, Any]:
866
  logger.info(f"Processing PDF: {pdf_path}")
867
  try:
868
- # Possibly call subtopic_extractor on first pages to find subtopics in the PDF as a whole
869
  subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
870
  logger.info(f"Gemini returned subtopics: {subtopics}")
871
 
 
299
  for attempt in range(max_retries + 1):
300
  try:
301
  prompt = """
302
+ You are given an image from an educational curriculum specification for Gemini Flash 2. The image may contain:
303
  1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
304
+ 2) A subtopic heading in the format "<number>.<number>" or "<number>.<number>.<number>", for example "2.5", "2.6", "3.4", "2.1.1", "4.3.3" or "1.2.1".
305
  3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
306
+ 4) Possibly no relevant text or only truncated text (e.g. "Topics", "Subject content", "What students need to learn", "Content Amplification Additional guidance notes", etc.).
307
 
308
  Your task is to extract:
309
  - **"title"**: A recognized main topic or heading text.
310
+ - **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4", "G2", "2.1.1", "4.1.1"), as an array of strings.
311
 
312
  Follow these rules:
313
 
314
+ (1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued":
315
+ - Remove the word "continued" if present.
316
+ - Put that resulting text in "title". (e.g. "2 Algebra and functions")
317
+ - "subtopics" should be an empty array, unless smaller subtopic numbers (e.g. "2.5") are also detected in the same text.
318
 
319
+ (2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4":
320
  - Collect those exact strings in the JSON key "subtopics" (an array of strings).
321
+ - "title" in this case should be an empty string if you only detect subtopics.
322
  (Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).
323
 
324
+ (3) **If no main topic or subtopic is detected but the text appears to be a heading**, for example "Specialisation, division of labour and exchange", then:
325
+ - Return:
326
+ {
327
+ "title": "<the heading text>",
328
+ "subtopics": []
329
+ }
330
 
331
  (4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
332
+ - Use that left column text as "title".
333
  - "subtopics" remains empty.
334
+ Example:
335
  If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
336
  {
337
  "title": "Scarcity, choice and opportunity cost",
338
  "subtopics": []
339
  }
340
 
341
+ (5) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) or it appears to be a standalone column with text, treat it as a heading.
342
+ - "subtopics" remains empty.
343
+ Example:
344
+ If there is only one column image that is "Specialisation, division of labour and exchange" and the right column is not present, your output is:
345
+ {
346
+ "title": "Specialisation, division of labour and exchange",
347
+ "subtopics": []
348
+ }
349
+
350
+ (6) **If there is a character + digit pattern** in the left column of a two-column table (for example "G2", "G3", "G4", "C1"), treat that as a topic-like label:
351
  - Put that label text into "title" (e.g. "G2").
352
  - "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.
353
 
354
+ (7) **Output must be valid JSON** in this exact structure, with no extra text or explanation:
355
  {
356
  "title": "...",
357
  "subtopics": [...]
358
  }
359
 
360
+ (8) **If the image is blank or truncated**, defined as:
361
+ - Contains no words at all (e.g. a blank white or black image), **OR**
362
+ - Contains only snippet words/phrases such as "Topics", "Subject content", "Content Amplification Additional guidance notes", "What students need to learn" (including variations in background color), **OR**
363
+ - Contains partial headings with no recognizable numeric or textual headings, **OR**
364
+ - Contains partial UI labels only, such as “Topics” in a gray bar or “What students need to learn” in a blue bar, with no additional meaningful text.
365
+ then return:
366
+ {
367
+ "title": "EMPTY_IMAGE",
368
+ "subtopics": []
369
+ }
370
+
371
+ (9) **If you cannot recognize any text matching the patterns above**, or the text is too partial/truncated to form a valid heading, also return:
372
+ {
373
+ "title": "EMPTY_IMAGE",
374
+ "subtopics": []
375
+ }
376
 
377
  **Examples**:
378
 
379
+ - If the image text is "2 Algebra and functions continued", return:
380
  {
381
  "title": "2 Algebra and functions",
382
  "subtopics": []
383
  }
384
 
385
+ - If the image text is "2.5 Solve linear and quadratic inequalities ...", return:
386
  {
387
  "title": "",
388
  "subtopics": ["2.5"]
389
  }
390
 
391
+ - If the image text is "Specialisation, division of labour and exchange" (with no numeric patterns at all), return:
392
  {
393
+ "title": "Specialisation, division of labour and exchange",
394
  "subtopics": []
395
  }
396
 
397
+ - If the left column says "G2" and the right column has details, but no subtopic numbers, return:
398
  {
399
  "title": "G2",
400
  "subtopics": []
401
  }
402
 
403
+ - If the image is blank or shows only partial/truncated snippet words (e.g. "Topics", "Content Amplification Additional guidance notes", "Subject content", "What students need to learn") and nothing else, return:
404
  {
405
+ "title": "EMPTY_IMAGE",
406
  "subtopics": []
407
  }
408
  """
 
409
  global _GEMINI_CLIENT
410
  if _GEMINI_CLIENT is None:
411
  _GEMINI_CLIENT = genai.Client(api_key=api_key)
 
500
  elif cls == "THREE_COLUMN":
501
  info['final_alt'] = "HAS TO BE PROCESSED - three column table"
502
  elif cls == "EMPTY_IMAGE":
 
503
  md_content = md_content.replace(f"![]({key}{p})", "")
504
  try:
505
  self.s3_writer.delete(info['s3_path'])
 
882
  def process(self, pdf_path: str) -> Dict[str, Any]:
883
  logger.info(f"Processing PDF: {pdf_path}")
884
  try:
 
885
  subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
886
  logger.info(f"Gemini returned subtopics: {subtopics}")
887
 
worker.py CHANGED
@@ -139,6 +139,7 @@ class RabbitMQWorker:
139
  try:
140
  pdf_url = file.get("url")
141
  logger.info("[Worker %s] Processing topic extraction for URL: %s", thread_id, pdf_url)
 
142
  result = self.topic_processor.process(pdf_url)
143
  context = {
144
  "key": file.get("key", ""),
 
139
  try:
140
  pdf_url = file.get("url")
141
  logger.info("[Worker %s] Processing topic extraction for URL: %s", thread_id, pdf_url)
142
+
143
  result = self.topic_processor.process(pdf_url)
144
  context = {
145
  "key": file.get("key", ""),