fix RabbitMQ
Browse files- __pycache__/topic_extraction.cpython-310.pyc +0 -0
- p_ms.PDF +0 -0
- pearson_json/_subtopics.json +39 -117
- topic_extraction.log +552 -0
- topic_extraction.py +52 -36
- worker.py +1 -0
__pycache__/topic_extraction.cpython-310.pyc
CHANGED
Binary files a/__pycache__/topic_extraction.cpython-310.pyc and b/__pycache__/topic_extraction.cpython-310.pyc differ
|
|
p_ms.PDF
DELETED
Binary file (291 kB)
|
|
pearson_json/_subtopics.json
CHANGED
@@ -6,6 +6,10 @@
|
|
6 |
"type": "image",
|
7 |
"key": "/topic-extraction/cells/img_1.jpg_r0_c0.png"
|
8 |
},
|
|
|
|
|
|
|
|
|
9 |
{
|
10 |
"type": "image",
|
11 |
"key": "/topic-extraction/cells/img_3.jpg_r0_c0.png"
|
@@ -24,177 +28,95 @@
|
|
24 |
},
|
25 |
{
|
26 |
"type": "image",
|
27 |
-
"key": "/topic-extraction/cells/
|
28 |
},
|
29 |
{
|
30 |
"type": "image",
|
31 |
-
"key": "/topic-extraction/cells/
|
32 |
},
|
33 |
{
|
34 |
"type": "image",
|
35 |
-
"key": "/topic-extraction/cells/
|
36 |
},
|
37 |
{
|
38 |
"type": "image",
|
39 |
-
"key": "/topic-extraction/cells/
|
40 |
},
|
41 |
{
|
42 |
"type": "image",
|
43 |
-
"key": "/topic-extraction/cells/
|
44 |
},
|
45 |
{
|
46 |
"type": "image",
|
47 |
-
"key": "/topic-extraction/cells/
|
48 |
},
|
49 |
{
|
50 |
"type": "image",
|
51 |
-
"key": "/topic-extraction/cells/
|
52 |
},
|
53 |
{
|
54 |
"type": "image",
|
55 |
-
"key": "/topic-extraction/cells/
|
56 |
},
|
57 |
{
|
58 |
"type": "image",
|
59 |
-
"key": "/topic-extraction/cells/
|
60 |
},
|
61 |
{
|
62 |
"type": "image",
|
63 |
-
"key": "/topic-extraction/cells/
|
64 |
},
|
65 |
{
|
66 |
"type": "image",
|
67 |
-
"key": "/topic-extraction/cells/
|
68 |
-
}
|
69 |
-
],
|
70 |
-
"children": []
|
71 |
-
},
|
72 |
-
{
|
73 |
-
"title": "Factors influencing demand and supply in product markets",
|
74 |
-
"contents": [
|
75 |
-
{
|
76 |
-
"type": "image",
|
77 |
-
"key": "/topic-extraction/cells/img_2.jpg_r1_c0.png"
|
78 |
-
}
|
79 |
-
],
|
80 |
-
"children": []
|
81 |
-
},
|
82 |
-
{
|
83 |
-
"title": "Why and how governments intervene in markets",
|
84 |
-
"contents": [
|
85 |
{
|
86 |
"type": "image",
|
87 |
-
"key": "/topic-extraction/cells/
|
88 |
-
}
|
89 |
-
],
|
90 |
-
"children": []
|
91 |
-
},
|
92 |
-
{
|
93 |
-
"title": "The circular flow of income model",
|
94 |
-
"contents": [
|
95 |
{
|
96 |
"type": "image",
|
97 |
-
"key": "/topic-extraction/cells/
|
98 |
-
}
|
99 |
-
],
|
100 |
-
"children": []
|
101 |
-
},
|
102 |
-
{
|
103 |
-
"title": "Government policy objectives",
|
104 |
-
"contents": [
|
105 |
{
|
106 |
"type": "image",
|
107 |
-
"key": "/topic-extraction/cells/
|
108 |
-
}
|
109 |
-
],
|
110 |
-
"children": []
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"title": "Fiscal policy",
|
114 |
-
"contents": [
|
115 |
{
|
116 |
"type": "image",
|
117 |
-
"key": "/topic-extraction/cells/
|
118 |
-
}
|
119 |
-
],
|
120 |
-
"children": []
|
121 |
-
},
|
122 |
-
{
|
123 |
-
"title": "Monetary policy",
|
124 |
-
"contents": [
|
125 |
{
|
126 |
"type": "image",
|
127 |
-
"key": "/topic-extraction/cells/
|
128 |
-
}
|
129 |
-
],
|
130 |
-
"children": []
|
131 |
-
},
|
132 |
-
{
|
133 |
-
"title": "Exchange rates and exchange rate policy",
|
134 |
-
"contents": [
|
135 |
{
|
136 |
"type": "image",
|
137 |
-
"key": "/topic-extraction/cells/
|
138 |
-
}
|
139 |
-
],
|
140 |
-
"children": []
|
141 |
-
},
|
142 |
-
{
|
143 |
-
"title": "Free trade and protectionism",
|
144 |
-
"contents": [
|
145 |
{
|
146 |
"type": "image",
|
147 |
-
"key": "/topic-extraction/cells/
|
148 |
-
}
|
149 |
-
],
|
150 |
-
"children": []
|
151 |
-
},
|
152 |
-
{
|
153 |
-
"title": "Monopoly",
|
154 |
-
"contents": [
|
155 |
{
|
156 |
"type": "image",
|
157 |
-
"key": "/topic-extraction/cells/
|
158 |
-
}
|
159 |
-
],
|
160 |
-
"children": []
|
161 |
-
},
|
162 |
-
{
|
163 |
-
"title": "Economic growth",
|
164 |
-
"contents": [
|
165 |
{
|
166 |
"type": "image",
|
167 |
-
"key": "/topic-extraction/cells/
|
168 |
-
}
|
169 |
-
],
|
170 |
-
"children": []
|
171 |
-
},
|
172 |
-
{
|
173 |
-
"title": "Inflation and deflation",
|
174 |
-
"contents": [
|
175 |
{
|
176 |
"type": "image",
|
177 |
-
"key": "/topic-extraction/cells/
|
178 |
-
}
|
179 |
-
],
|
180 |
-
"children": []
|
181 |
-
},
|
182 |
-
{
|
183 |
-
"title": "The balance of payments",
|
184 |
-
"contents": [
|
185 |
{
|
186 |
"type": "image",
|
187 |
-
"key": "/topic-extraction/cells/
|
188 |
-
}
|
189 |
-
],
|
190 |
-
"children": []
|
191 |
-
},
|
192 |
-
{
|
193 |
-
"title": "Control of the national (public sector) debt",
|
194 |
-
"contents": [
|
195 |
{
|
196 |
"type": "image",
|
197 |
-
"key": "/topic-extraction/cells/
|
198 |
}
|
199 |
],
|
200 |
"children": []
|
|
|
6 |
"type": "image",
|
7 |
"key": "/topic-extraction/cells/img_1.jpg_r0_c0.png"
|
8 |
},
|
9 |
+
{
|
10 |
+
"type": "image",
|
11 |
+
"key": "/topic-extraction/cells/img_2.jpg_r0_c0.png"
|
12 |
+
},
|
13 |
{
|
14 |
"type": "image",
|
15 |
"key": "/topic-extraction/cells/img_3.jpg_r0_c0.png"
|
|
|
28 |
},
|
29 |
{
|
30 |
"type": "image",
|
31 |
+
"key": "/topic-extraction/cells/img_7.jpg_r0_c0.png"
|
32 |
},
|
33 |
{
|
34 |
"type": "image",
|
35 |
+
"key": "/topic-extraction/cells/img_8.jpg_r1_c0.png"
|
36 |
},
|
37 |
{
|
38 |
"type": "image",
|
39 |
+
"key": "/topic-extraction/cells/img_9.jpg_r0_c0.png"
|
40 |
},
|
41 |
{
|
42 |
"type": "image",
|
43 |
+
"key": "/topic-extraction/cells/img_10.jpg_r0_c0.png"
|
44 |
},
|
45 |
{
|
46 |
"type": "image",
|
47 |
+
"key": "/topic-extraction/cells/img_11.jpg_r0_c0.png"
|
48 |
},
|
49 |
{
|
50 |
"type": "image",
|
51 |
+
"key": "/topic-extraction/cells/img_12.jpg_r0_c0.png"
|
52 |
},
|
53 |
{
|
54 |
"type": "image",
|
55 |
+
"key": "/topic-extraction/cells/img_13.jpg_r0_c1.png"
|
56 |
},
|
57 |
{
|
58 |
"type": "image",
|
59 |
+
"key": "/topic-extraction/cells/img_14.jpg_r0_c0.png"
|
60 |
},
|
61 |
{
|
62 |
"type": "image",
|
63 |
+
"key": "/topic-extraction/cells/img_15.jpg_r0_c0.png"
|
64 |
},
|
65 |
{
|
66 |
"type": "image",
|
67 |
+
"key": "/topic-extraction/cells/img_16.jpg_r0_c0.png"
|
68 |
},
|
69 |
{
|
70 |
"type": "image",
|
71 |
+
"key": "/topic-extraction/cells/img_17.jpg_r1_c0.png"
|
72 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
{
|
74 |
"type": "image",
|
75 |
+
"key": "/topic-extraction/cells/img_18.jpg_r0_c0.png"
|
76 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
{
|
78 |
"type": "image",
|
79 |
+
"key": "/topic-extraction/cells/img_19.jpg_r0_c0.png"
|
80 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
{
|
82 |
"type": "image",
|
83 |
+
"key": "/topic-extraction/cells/img_20.jpg_r0_c0.png"
|
84 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
{
|
86 |
"type": "image",
|
87 |
+
"key": "/topic-extraction/cells/img_21.jpg_r0_c0.png"
|
88 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
{
|
90 |
"type": "image",
|
91 |
+
"key": "/topic-extraction/cells/img_22.jpg_r0_c0.png"
|
92 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
{
|
94 |
"type": "image",
|
95 |
+
"key": "/topic-extraction/cells/img_23.jpg_r0_c0.png"
|
96 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
{
|
98 |
"type": "image",
|
99 |
+
"key": "/topic-extraction/cells/img_24.jpg_r0_c0.png"
|
100 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
{
|
102 |
"type": "image",
|
103 |
+
"key": "/topic-extraction/cells/img_25.jpg_r1_c0.png"
|
104 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
{
|
106 |
"type": "image",
|
107 |
+
"key": "/topic-extraction/cells/img_26.jpg_r0_c0.png"
|
108 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
{
|
110 |
"type": "image",
|
111 |
+
"key": "/topic-extraction/cells/img_27.jpg_r0_c0.png"
|
112 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
{
|
114 |
"type": "image",
|
115 |
+
"key": "/topic-extraction/cells/img_28.jpg_r0_c0.png"
|
116 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
{
|
118 |
"type": "image",
|
119 |
+
"key": "/topic-extraction/cells/img_29.jpg_r0_c0.png"
|
120 |
}
|
121 |
],
|
122 |
"children": []
|
topic_extraction.log
CHANGED
@@ -6931,3 +6931,555 @@ and series'. Using page 7.
|
|
6931 |
2025-03-04 16:17:43,682 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/pearson_json/_subtopics.json
|
6932 |
2025-03-04 16:17:43,995 [INFO] __main__ - GPU memory cleaned up.
|
6933 |
2025-03-04 16:17:44,000 [INFO] __main__ - Processing completed successfully.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6931 |
2025-03-04 16:17:43,682 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/pearson_json/_subtopics.json
|
6932 |
2025-03-04 16:17:43,995 [INFO] __main__ - GPU memory cleaned up.
|
6933 |
2025-03-04 16:17:44,000 [INFO] __main__ - Processing completed successfully.
|
6934 |
+
2025-03-04 16:39:05,313 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf
|
6935 |
+
2025-03-04 16:39:06,086 [INFO] __main__ - Gemini returned subtopics: {'2.1AS units': [7, 22], '2.2A2 units': [23, 43]}
|
6936 |
+
2025-03-04 16:39:06,088 [INFO] __main__ - Loaded 3543551 bytes from local file '/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf'
|
6937 |
+
2025-03-04 16:39:06,326 [INFO] __main__ - Computed global offset: 0
|
6938 |
+
2025-03-04 16:39:06,326 [INFO] __main__ - Processing pages (0-based): [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
|
6939 |
+
2025-03-04 16:39:49,136 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
|
6940 |
+
2025-03-04 16:39:49,708 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
|
6941 |
+
2025-03-04 16:39:50,157 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
|
6942 |
+
2025-03-04 16:39:50,688 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
|
6943 |
+
2025-03-04 16:39:51,083 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
|
6944 |
+
2025-03-04 16:39:51,533 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
|
6945 |
+
2025-03-04 16:39:52,100 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
|
6946 |
+
2025-03-04 16:39:52,532 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
|
6947 |
+
2025-03-04 16:39:52,942 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
|
6948 |
+
2025-03-04 16:39:53,244 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
|
6949 |
+
2025-03-04 16:39:53,742 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
|
6950 |
+
2025-03-04 16:39:54,213 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
|
6951 |
+
2025-03-04 16:39:54,761 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
|
6952 |
+
2025-03-04 16:39:55,050 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
|
6953 |
+
2025-03-04 16:39:55,740 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
|
6954 |
+
2025-03-04 16:39:56,304 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
|
6955 |
+
2025-03-04 16:39:56,780 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
|
6956 |
+
2025-03-04 16:39:57,175 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
|
6957 |
+
2025-03-04 16:39:57,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
|
6958 |
+
2025-03-04 16:39:58,140 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
|
6959 |
+
2025-03-04 16:39:58,682 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
|
6960 |
+
2025-03-04 16:39:59,190 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
|
6961 |
+
2025-03-04 16:39:59,577 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
|
6962 |
+
2025-03-04 16:40:00,229 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
|
6963 |
+
2025-03-04 16:40:00,732 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
|
6964 |
+
2025-03-04 16:40:01,136 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
|
6965 |
+
2025-03-04 16:40:01,706 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
|
6966 |
+
2025-03-04 16:40:02,236 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
|
6967 |
+
2025-03-04 16:40:02,621 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
|
6968 |
+
2025-03-04 16:40:03,051 [INFO] __main__ - Classifying images to detect tables.
|
6969 |
+
2025-03-04 16:40:06,927 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
|
6970 |
+
2025-03-04 16:40:10,403 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r0_c0.png
|
6971 |
+
2025-03-04 16:40:11,481 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c0.png
|
6972 |
+
2025-03-04 16:40:12,796 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c1.png
|
6973 |
+
2025-03-04 16:40:13,767 [ERROR] __main__ - Gemini subtopic identification error on attempt 0: Expecting value: line 1 column 1 (char 0)
|
6974 |
+
2025-03-04 16:40:15,308 [ERROR] __main__ - Gemini subtopic identification error on attempt 1: Expecting value: line 1 column 1 (char 0)
|
6975 |
+
2025-03-04 16:40:15,585 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r2_c0.png
|
6976 |
+
2025-03-04 16:40:18,265 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r2_c1.png
|
6977 |
+
2025-03-04 16:40:19,708 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r3_c0.png
|
6978 |
+
2025-03-04 16:40:20,908 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r3_c1.png
|
6979 |
+
2025-03-04 16:40:22,033 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
|
6980 |
+
2025-03-04 16:40:22,999 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
|
6981 |
+
2025-03-04 16:40:22,999 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
|
6982 |
+
2025-03-04 16:40:26,396 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r0_c0.png
|
6983 |
+
2025-03-04 16:40:27,834 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c0.png
|
6984 |
+
2025-03-04 16:40:29,314 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c1.png
|
6985 |
+
2025-03-04 16:40:30,652 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c0.png
|
6986 |
+
2025-03-04 16:40:32,068 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c1.png
|
6987 |
+
2025-03-04 16:40:33,239 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c0.png
|
6988 |
+
2025-03-04 16:40:34,633 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c1.png
|
6989 |
+
2025-03-04 16:40:35,597 [WARNING] __main__ - Cell image not found: /tmp/tmpkr6p74mz.jpg_rows/row_4/col_0.png
|
6990 |
+
2025-03-04 16:40:35,598 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
|
6991 |
+
2025-03-04 16:40:38,470 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r0_c0.png
|
6992 |
+
2025-03-04 16:40:39,732 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c0.png
|
6993 |
+
2025-03-04 16:40:41,236 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c1.png
|
6994 |
+
2025-03-04 16:40:42,293 [WARNING] __main__ - Cell image not found: /tmp/tmp_4ioykgq.jpg_rows/row_2/col_0.png
|
6995 |
+
2025-03-04 16:40:42,293 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
|
6996 |
+
2025-03-04 16:40:44,974 [WARNING] __main__ - Cell image not found: /tmp/tmp8qnr07bo.jpg_rows/row_0/col_0.png
|
6997 |
+
2025-03-04 16:40:44,974 [WARNING] __main__ - Cell image not found: /tmp/tmp8qnr07bo.jpg_rows/row_0/col_1.png
|
6998 |
+
2025-03-04 16:40:45,250 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r1_c0.png
|
6999 |
+
2025-03-04 16:40:46,109 [WARNING] __main__ - Cell image not found: /tmp/tmp8qnr07bo.jpg_rows/row_1/col_1.png
|
7000 |
+
2025-03-04 16:40:46,385 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r2_c0.png
|
7001 |
+
2025-03-04 16:40:47,759 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r2_c1.png
|
7002 |
+
2025-03-04 16:40:58,221 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r3_c0.png
|
7003 |
+
2025-03-04 16:40:59,680 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r3_c1.png
|
7004 |
+
2025-03-04 16:41:00,920 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
|
7005 |
+
2025-03-04 16:41:01,935 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
|
7006 |
+
2025-03-04 16:41:01,936 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
|
7007 |
+
2025-03-04 16:41:03,077 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r0_c0.png
|
7008 |
+
2025-03-04 16:41:04,211 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c0.png
|
7009 |
+
2025-03-04 16:41:05,575 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c1.png
|
7010 |
+
2025-03-04 16:41:06,564 [WARNING] __main__ - Cell image not found: /tmp/tmph28hdp5v.jpg_rows/row_2/col_0.png
|
7011 |
+
2025-03-04 16:41:06,564 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
|
7012 |
+
2025-03-04 16:41:08,734 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r0_c0.png
|
7013 |
+
2025-03-04 16:41:09,813 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c0.png
|
7014 |
+
2025-03-04 16:41:11,241 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c1.png
|
7015 |
+
2025-03-04 16:41:12,566 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
|
7016 |
+
2025-03-04 16:41:13,534 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
|
7017 |
+
2025-03-04 16:41:13,535 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
|
7018 |
+
2025-03-04 16:41:17,251 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r0_c0.png
|
7019 |
+
2025-03-04 16:41:18,440 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c0.png
|
7020 |
+
2025-03-04 16:41:20,101 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c1.png
|
7021 |
+
2025-03-04 16:41:21,465 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c0.png
|
7022 |
+
2025-03-04 16:41:22,836 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c1.png
|
7023 |
+
2025-03-04 16:41:24,168 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
|
7024 |
+
2025-03-04 16:41:25,738 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
|
7025 |
+
2025-03-04 16:41:25,738 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
|
7026 |
+
2025-03-04 16:41:28,044 [WARNING] __main__ - Cell image not found: /tmp/tmp2s0xxpac.jpg_rows/row_0/col_0.png
|
7027 |
+
2025-03-04 16:41:28,044 [WARNING] __main__ - Cell image not found: /tmp/tmp2s0xxpac.jpg_rows/row_0/col_1.png
|
7028 |
+
2025-03-04 16:41:28,317 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r1_c0.png
|
7029 |
+
2025-03-04 16:41:29,465 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c0.png
|
7030 |
+
2025-03-04 16:41:30,769 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c1.png
|
7031 |
+
2025-03-04 16:41:32,249 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c0.png
|
7032 |
+
2025-03-04 16:41:33,665 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c1.png
|
7033 |
+
2025-03-04 16:41:34,812 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
|
7034 |
+
2025-03-04 16:41:35,736 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
|
7035 |
+
2025-03-04 16:41:35,737 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
|
7036 |
+
2025-03-04 16:41:38,094 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r0_c0.png
|
7037 |
+
2025-03-04 16:41:39,194 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c0.png
|
7038 |
+
2025-03-04 16:41:40,315 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c1.png
|
7039 |
+
2025-03-04 16:41:41,569 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c0.png
|
7040 |
+
2025-03-04 16:41:42,914 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c1.png
|
7041 |
+
2025-03-04 16:41:44,447 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r3_c0.png
|
7042 |
+
2025-03-04 16:41:45,567 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r3_c1.png
|
7043 |
+
2025-03-04 16:41:46,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
|
7044 |
+
2025-03-04 16:41:47,934 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
|
7045 |
+
2025-03-04 16:41:47,935 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
|
7046 |
+
2025-03-04 16:41:48,856 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r0_c0.png
|
7047 |
+
2025-03-04 16:41:49,986 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c0.png
|
7048 |
+
2025-03-04 16:41:51,601 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c1.png
|
7049 |
+
2025-03-04 16:41:52,769 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
|
7050 |
+
2025-03-04 16:41:53,870 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
|
7051 |
+
2025-03-04 16:41:53,871 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=three
|
7052 |
+
2025-03-04 16:41:56,443 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r0_c0.png
|
7053 |
+
2025-03-04 16:41:57,555 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c0.png
|
7054 |
+
2025-03-04 16:41:59,137 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c1.png
|
7055 |
+
2025-03-04 16:42:00,476 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
|
7056 |
+
2025-03-04 16:42:01,609 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
|
7057 |
+
2025-03-04 16:42:01,610 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
|
7058 |
+
2025-03-04 16:42:04,361 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r0_c0.png
|
7059 |
+
2025-03-04 16:42:05,501 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c0.png
|
7060 |
+
2025-03-04 16:42:07,023 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c1.png
|
7061 |
+
2025-03-04 16:42:08,304 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
|
7062 |
+
2025-03-04 16:42:09,424 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
|
7063 |
+
2025-03-04 16:42:09,425 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
|
7064 |
+
2025-03-04 16:42:12,614 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
|
7065 |
+
2025-03-04 16:42:13,436 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
|
7066 |
+
2025-03-04 16:42:13,691 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c1.png
|
7067 |
+
2025-03-04 16:42:14,930 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c0.png
|
7068 |
+
2025-03-04 16:42:16,395 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c1.png
|
7069 |
+
2025-03-04 16:42:17,794 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
|
7070 |
+
2025-03-04 16:42:18,797 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
|
7071 |
+
2025-03-04 16:42:19,053 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c0.png
|
7072 |
+
2025-03-04 16:42:20,378 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c1.png
|
7073 |
+
2025-03-04 16:42:21,565 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
|
7074 |
+
2025-03-04 16:42:22,635 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
|
7075 |
+
2025-03-04 16:42:22,635 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
|
7076 |
+
2025-03-04 16:42:23,713 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r0_c0.png
|
7077 |
+
2025-03-04 16:42:24,787 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c0.png
|
7078 |
+
2025-03-04 16:42:26,077 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c1.png
|
7079 |
+
2025-03-04 16:42:27,195 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
|
7080 |
+
2025-03-04 16:42:28,273 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
|
7081 |
+
2025-03-04 16:42:28,274 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
|
7082 |
+
2025-03-04 16:42:32,234 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r0_c0.png
|
7083 |
+
2025-03-04 16:42:33,414 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c0.png
|
7084 |
+
2025-03-04 16:42:35,062 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c1.png
|
7085 |
+
2025-03-04 16:42:36,362 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c0.png
|
7086 |
+
2025-03-04 16:42:37,790 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c1.png
|
7087 |
+
2025-03-04 16:42:38,877 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c0.png
|
7088 |
+
2025-03-04 16:42:40,011 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c1.png
|
7089 |
+
2025-03-04 16:42:41,094 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
|
7090 |
+
2025-03-04 16:42:42,019 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
|
7091 |
+
2025-03-04 16:42:42,020 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
|
7092 |
+
2025-03-04 16:42:45,163 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r0_c0.png
|
7093 |
+
2025-03-04 16:42:46,253 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c0.png
|
7094 |
+
2025-03-04 16:42:47,665 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c1.png
|
7095 |
+
2025-03-04 16:42:48,812 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c0.png
|
7096 |
+
2025-03-04 16:42:50,033 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c1.png
|
7097 |
+
2025-03-04 16:42:51,432 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c0.png
|
7098 |
+
2025-03-04 16:42:52,858 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c1.png
|
7099 |
+
2025-03-04 16:42:54,216 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c0.png
|
7100 |
+
2025-03-04 16:42:55,778 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c1.png
|
7101 |
+
2025-03-04 16:42:56,931 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
|
7102 |
+
2025-03-04 16:42:57,851 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
|
7103 |
+
2025-03-04 16:42:57,851 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
|
7104 |
+
2025-03-04 16:43:01,201 [WARNING] __main__ - Cell image not found: /tmp/tmpdmvh3rc8.jpg_rows/row_0/col_0.png
|
7105 |
+
2025-03-04 16:43:01,475 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r1_c0.png
|
7106 |
+
2025-03-04 16:43:02,567 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c0.png
|
7107 |
+
2025-03-04 16:43:04,176 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c1.png
|
7108 |
+
2025-03-04 16:43:05,365 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c0.png
|
7109 |
+
2025-03-04 16:43:06,802 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c1.png
|
7110 |
+
2025-03-04 16:43:07,969 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
|
7111 |
+
2025-03-04 16:43:08,946 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
|
7112 |
+
2025-03-04 16:43:08,946 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
|
7113 |
+
2025-03-04 16:43:10,806 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r0_c0.png
|
7114 |
+
2025-03-04 16:43:11,925 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c0.png
|
7115 |
+
2025-03-04 16:43:13,286 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c1.png
|
7116 |
+
2025-03-04 16:43:14,651 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r2_c0.png
|
7117 |
+
2025-03-04 16:43:16,070 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r2_c1.png
|
7118 |
+
2025-03-04 16:43:17,209 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
|
7119 |
+
2025-03-04 16:43:18,206 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
|
7120 |
+
2025-03-04 16:43:18,206 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
|
7121 |
+
2025-03-04 16:43:21,350 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r0_c0.png
|
7122 |
+
2025-03-04 16:43:22,479 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c0.png
|
7123 |
+
2025-03-04 16:43:24,003 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c1.png
|
7124 |
+
2025-03-04 16:43:25,317 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c0.png
|
7125 |
+
2025-03-04 16:43:26,815 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c1.png
|
7126 |
+
2025-03-04 16:43:28,078 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
|
7127 |
+
2025-03-04 16:43:29,086 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
|
7128 |
+
2025-03-04 16:43:29,086 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
|
7129 |
+
2025-03-04 16:43:30,918 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r0_c0.png
|
7130 |
+
2025-03-04 16:43:32,141 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c0.png
|
7131 |
+
2025-03-04 16:43:33,282 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c1.png
|
7132 |
+
2025-03-04 16:43:34,592 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r2_c0.png
|
7133 |
+
2025-03-04 16:43:36,080 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r2_c1.png
|
7134 |
+
2025-03-04 16:43:37,530 [WARNING] __main__ - Cell image not found: /tmp/tmp6_d2lvpn.jpg_rows/row_3/col_0.png
|
7135 |
+
2025-03-04 16:43:37,531 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
|
7136 |
+
2025-03-04 16:43:40,529 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r0_c0.png
|
7137 |
+
2025-03-04 16:43:41,854 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c0.png
|
7138 |
+
2025-03-04 16:43:43,415 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c1.png
|
7139 |
+
2025-03-04 16:43:45,170 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
|
7140 |
+
2025-03-04 16:43:46,291 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
|
7141 |
+
2025-03-04 16:43:46,292 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
|
7142 |
+
2025-03-04 16:43:48,973 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r0_c0.png
|
7143 |
+
2025-03-04 16:43:50,109 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c0.png
|
7144 |
+
2025-03-04 16:43:51,618 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c1.png
|
7145 |
+
2025-03-04 16:43:52,724 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_22.jpg_r1_c1.png
|
7146 |
+
2025-03-04 16:43:52,904 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
|
7147 |
+
2025-03-04 16:43:54,163 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
|
7148 |
+
2025-03-04 16:43:54,163 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
|
7149 |
+
2025-03-04 16:43:56,200 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r0_c0.png
|
7150 |
+
2025-03-04 16:43:57,589 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c0.png
|
7151 |
+
2025-03-04 16:43:59,010 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c1.png
|
7152 |
+
2025-03-04 16:44:00,106 [WARNING] __main__ - Cell image not found: /tmp/tmp5l7mn427.jpg_rows/row_2/col_0.png
|
7153 |
+
2025-03-04 16:44:00,107 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
|
7154 |
+
2025-03-04 16:44:03,906 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r0_c0.png
|
7155 |
+
2025-03-04 16:44:05,120 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c0.png
|
7156 |
+
2025-03-04 16:44:06,699 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c1.png
|
7157 |
+
2025-03-04 16:44:08,013 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_24.jpg_r1_c1.png
|
7158 |
+
2025-03-04 16:44:08,014 [WARNING] __main__ - Cell image not found: /tmp/tmp7f4e012v.jpg_rows/row_2/col_0.png
|
7159 |
+
2025-03-04 16:44:08,019 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=three
|
7160 |
+
2025-03-04 16:44:10,562 [WARNING] __main__ - Cell image not found: /tmp/tmpsr1107vb.jpg_rows/row_0/col_0.png
|
7161 |
+
2025-03-04 16:44:10,823 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r1_c0.png
|
7162 |
+
2025-03-04 16:44:12,067 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c0.png
|
7163 |
+
2025-03-04 16:44:13,630 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c1.png
|
7164 |
+
2025-03-04 16:44:15,001 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
|
7165 |
+
2025-03-04 16:44:16,162 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
|
7166 |
+
2025-03-04 16:44:16,163 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
|
7167 |
+
2025-03-04 16:44:18,257 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r0_c0.png
|
7168 |
+
2025-03-04 16:44:19,367 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c0.png
|
7169 |
+
2025-03-04 16:44:20,866 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c1.png
|
7170 |
+
2025-03-04 16:44:22,029 [WARNING] __main__ - Cell image not found: /tmp/tmpm4jr09co.jpg_rows/row_2/col_0.png
|
7171 |
+
2025-03-04 16:44:22,030 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
|
7172 |
+
2025-03-04 16:44:25,458 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r0_c0.png
|
7173 |
+
2025-03-04 16:44:26,636 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c0.png
|
7174 |
+
2025-03-04 16:44:28,117 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c1.png
|
7175 |
+
2025-03-04 16:44:29,316 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c0.png
|
7176 |
+
2025-03-04 16:44:30,892 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c1.png
|
7177 |
+
2025-03-04 16:44:32,031 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
|
7178 |
+
2025-03-04 16:44:32,983 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
|
7179 |
+
2025-03-04 16:44:32,984 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=three
|
7180 |
+
2025-03-04 16:44:35,702 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r0_c0.png
|
7181 |
+
2025-03-04 16:44:37,077 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c0.png
|
7182 |
+
2025-03-04 16:44:38,586 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c1.png
|
7183 |
+
2025-03-04 16:44:40,000 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
|
7184 |
+
2025-03-04 16:44:41,005 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
|
7185 |
+
2025-03-04 16:44:41,006 [INFO] __main__ - Processing table image: /topic-extraction/img_29.jpg, columns=three
|
7186 |
+
2025-03-04 16:44:42,801 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r0_c0.png
|
7187 |
+
2025-03-04 16:44:43,877 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r1_c0.png
|
7188 |
+
2025-03-04 16:44:45,297 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r1_c1.png
|
7189 |
+
2025-03-04 16:44:46,572 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
|
7190 |
+
2025-03-04 16:44:47,560 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
|
7191 |
+
2025-03-04 16:44:47,564 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/pearson_json/_subtopics.json
|
7192 |
+
2025-03-04 16:44:47,893 [INFO] __main__ - GPU memory cleaned up.
|
7193 |
+
2025-03-04 16:44:47,898 [INFO] __main__ - Processing completed successfully.
|
7194 |
+
2025-03-04 17:13:14,000 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf
|
7195 |
+
2025-03-04 17:13:14,813 [INFO] __main__ - Gemini returned subtopics: {'2.1AS units': [7, 22], '2.2A2 units': [23, 43]}
|
7196 |
+
2025-03-04 17:13:14,814 [INFO] __main__ - Loaded 3543551 bytes from local file '/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf'
|
7197 |
+
2025-03-04 17:13:15,049 [INFO] __main__ - Computed global offset: 0
|
7198 |
+
2025-03-04 17:13:15,049 [INFO] __main__ - Processing pages (0-based): [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
|
7199 |
+
2025-03-04 17:13:55,840 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
|
7200 |
+
2025-03-04 17:13:56,487 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
|
7201 |
+
2025-03-04 17:13:56,943 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
|
7202 |
+
2025-03-04 17:13:57,441 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
|
7203 |
+
2025-03-04 17:13:57,816 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
|
7204 |
+
2025-03-04 17:13:58,206 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
|
7205 |
+
2025-03-04 17:13:58,724 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
|
7206 |
+
2025-03-04 17:13:59,172 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
|
7207 |
+
2025-03-04 17:13:59,579 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
|
7208 |
+
2025-03-04 17:13:59,870 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
|
7209 |
+
2025-03-04 17:14:00,375 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
|
7210 |
+
2025-03-04 17:14:00,860 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
|
7211 |
+
2025-03-04 17:14:01,418 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
|
7212 |
+
2025-03-04 17:14:01,705 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
|
7213 |
+
2025-03-04 17:14:02,299 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
|
7214 |
+
2025-03-04 17:14:02,835 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
|
7215 |
+
2025-03-04 17:14:03,343 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
|
7216 |
+
2025-03-04 17:14:03,722 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
|
7217 |
+
2025-03-04 17:14:04,256 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
|
7218 |
+
2025-03-04 17:14:04,622 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
|
7219 |
+
2025-03-04 17:14:05,155 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
|
7220 |
+
2025-03-04 17:14:05,630 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
|
7221 |
+
2025-03-04 17:14:06,016 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
|
7222 |
+
2025-03-04 17:14:06,624 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
|
7223 |
+
2025-03-04 17:14:07,057 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
|
7224 |
+
2025-03-04 17:14:07,468 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
|
7225 |
+
2025-03-04 17:14:08,013 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
|
7226 |
+
2025-03-04 17:14:08,559 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
|
7227 |
+
2025-03-04 17:14:08,944 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
|
7228 |
+
2025-03-04 17:14:09,370 [INFO] __main__ - Classifying images to detect tables.
|
7229 |
+
2025-03-04 17:14:13,356 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
|
7230 |
+
2025-03-04 17:14:16,548 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r0_c0.png
|
7231 |
+
2025-03-04 17:14:17,824 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c0.png
|
7232 |
+
2025-03-04 17:14:19,207 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c1.png
|
7233 |
+
2025-03-04 17:14:20,785 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r2_c0.png
|
7234 |
+
2025-03-04 17:14:22,337 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r2_c1.png
|
7235 |
+
2025-03-04 17:14:24,117 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r3_c0.png
|
7236 |
+
2025-03-04 17:14:25,468 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r3_c1.png
|
7237 |
+
2025-03-04 17:14:26,797 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
|
7238 |
+
2025-03-04 17:14:27,715 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
|
7239 |
+
2025-03-04 17:14:27,715 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
|
7240 |
+
2025-03-04 17:14:31,016 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r0_c0.png
|
7241 |
+
2025-03-04 17:14:32,468 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c0.png
|
7242 |
+
2025-03-04 17:14:34,010 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c1.png
|
7243 |
+
2025-03-04 17:14:37,127 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c0.png
|
7244 |
+
2025-03-04 17:14:38,574 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c1.png
|
7245 |
+
2025-03-04 17:14:40,014 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c0.png
|
7246 |
+
2025-03-04 17:14:41,453 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c1.png
|
7247 |
+
2025-03-04 17:14:43,026 [WARNING] __main__ - Cell image not found: /tmp/tmpgz3m3b9n.jpg_rows/row_4/col_0.png
|
7248 |
+
2025-03-04 17:14:43,026 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
|
7249 |
+
2025-03-04 17:14:45,066 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r0_c0.png
|
7250 |
+
2025-03-04 17:14:46,513 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c0.png
|
7251 |
+
2025-03-04 17:14:48,054 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c1.png
|
7252 |
+
2025-03-04 17:14:49,517 [WARNING] __main__ - Cell image not found: /tmp/tmpkn9damk4.jpg_rows/row_2/col_0.png
|
7253 |
+
2025-03-04 17:14:49,518 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
|
7254 |
+
2025-03-04 17:14:51,857 [WARNING] __main__ - Cell image not found: /tmp/tmp_7v9cvwb.jpg_rows/row_0/col_0.png
|
7255 |
+
2025-03-04 17:14:51,857 [WARNING] __main__ - Cell image not found: /tmp/tmp_7v9cvwb.jpg_rows/row_0/col_1.png
|
7256 |
+
2025-03-04 17:14:52,128 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r1_c0.png
|
7257 |
+
2025-03-04 17:14:53,246 [WARNING] __main__ - Cell image not found: /tmp/tmp_7v9cvwb.jpg_rows/row_1/col_1.png
|
7258 |
+
2025-03-04 17:14:53,522 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r2_c0.png
|
7259 |
+
2025-03-04 17:14:54,896 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r2_c1.png
|
7260 |
+
2025-03-04 17:14:56,522 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r3_c0.png
|
7261 |
+
2025-03-04 17:14:57,958 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r3_c1.png
|
7262 |
+
2025-03-04 17:14:59,510 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
|
7263 |
+
2025-03-04 17:15:00,493 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
|
7264 |
+
2025-03-04 17:15:00,494 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
|
7265 |
+
2025-03-04 17:15:01,571 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r0_c0.png
|
7266 |
+
2025-03-04 17:15:02,997 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c0.png
|
7267 |
+
2025-03-04 17:15:04,424 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c1.png
|
7268 |
+
2025-03-04 17:15:05,776 [WARNING] __main__ - Cell image not found: /tmp/tmpr68pawul.jpg_rows/row_2/col_0.png
|
7269 |
+
2025-03-04 17:15:05,776 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
|
7270 |
+
2025-03-04 17:15:07,783 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r0_c0.png
|
7271 |
+
2025-03-04 17:15:09,073 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c0.png
|
7272 |
+
2025-03-04 17:15:10,711 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c1.png
|
7273 |
+
2025-03-04 17:15:12,507 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
|
7274 |
+
2025-03-04 17:15:13,630 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
|
7275 |
+
2025-03-04 17:15:13,631 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
|
7276 |
+
2025-03-04 17:15:16,878 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r0_c0.png
|
7277 |
+
2025-03-04 17:15:18,344 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c0.png
|
7278 |
+
2025-03-04 17:15:19,949 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c1.png
|
7279 |
+
2025-03-04 17:15:22,552 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c0.png
|
7280 |
+
2025-03-04 17:15:23,888 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c1.png
|
7281 |
+
2025-03-04 17:15:25,222 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
|
7282 |
+
2025-03-04 17:15:26,200 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
|
7283 |
+
2025-03-04 17:15:26,200 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
|
7284 |
+
2025-03-04 17:15:28,378 [WARNING] __main__ - Cell image not found: /tmp/tmpeauayzcm.jpg_rows/row_0/col_0.png
|
7285 |
+
2025-03-04 17:15:28,378 [WARNING] __main__ - Cell image not found: /tmp/tmpeauayzcm.jpg_rows/row_0/col_1.png
|
7286 |
+
2025-03-04 17:15:28,642 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r1_c0.png
|
7287 |
+
2025-03-04 17:15:30,092 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c0.png
|
7288 |
+
2025-03-04 17:15:31,485 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c1.png
|
7289 |
+
2025-03-04 17:15:33,367 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c0.png
|
7290 |
+
2025-03-04 17:15:34,783 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c1.png
|
7291 |
+
2025-03-04 17:15:36,384 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
|
7292 |
+
2025-03-04 17:15:37,395 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
|
7293 |
+
2025-03-04 17:15:37,396 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
|
7294 |
+
2025-03-04 17:15:39,469 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r0_c0.png
|
7295 |
+
2025-03-04 17:15:40,865 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c0.png
|
7296 |
+
2025-03-04 17:15:42,177 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c1.png
|
7297 |
+
2025-03-04 17:15:43,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c0.png
|
7298 |
+
2025-03-04 17:15:45,111 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c1.png
|
7299 |
+
2025-03-04 17:15:47,334 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r3_c0.png
|
7300 |
+
2025-03-04 17:15:48,513 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r3_c1.png
|
7301 |
+
2025-03-04 17:15:49,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
|
7302 |
+
2025-03-04 17:15:50,582 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
|
7303 |
+
2025-03-04 17:15:50,582 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
|
7304 |
+
2025-03-04 17:15:51,570 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r0_c0.png
|
7305 |
+
2025-03-04 17:15:53,041 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c0.png
|
7306 |
+
2025-03-04 17:15:54,468 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c1.png
|
7307 |
+
2025-03-04 17:15:55,844 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
|
7308 |
+
2025-03-04 17:15:56,966 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
|
7309 |
+
2025-03-04 17:15:56,967 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=three
|
7310 |
+
2025-03-04 17:15:59,374 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r0_c0.png
|
7311 |
+
2025-03-04 17:16:00,804 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c0.png
|
7312 |
+
2025-03-04 17:16:02,580 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c1.png
|
7313 |
+
2025-03-04 17:16:04,173 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
|
7314 |
+
2025-03-04 17:16:05,062 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
|
7315 |
+
2025-03-04 17:16:05,062 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
|
7316 |
+
2025-03-04 17:16:07,653 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r0_c0.png
|
7317 |
+
2025-03-04 17:16:09,201 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c0.png
|
7318 |
+
2025-03-04 17:16:10,928 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c1.png
|
7319 |
+
2025-03-04 17:16:12,739 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
|
7320 |
+
2025-03-04 17:16:13,735 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
|
7321 |
+
2025-03-04 17:16:13,735 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
|
7322 |
+
2025-03-04 17:16:16,756 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
|
7323 |
+
2025-03-04 17:16:17,689 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
|
7324 |
+
2025-03-04 17:16:17,947 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c1.png
|
7325 |
+
2025-03-04 17:16:19,521 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c0.png
|
7326 |
+
2025-03-04 17:16:21,310 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c1.png
|
7327 |
+
2025-03-04 17:16:23,370 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
|
7328 |
+
2025-03-04 17:16:24,380 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
|
7329 |
+
2025-03-04 17:16:24,634 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c0.png
|
7330 |
+
2025-03-04 17:16:26,009 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c1.png
|
7331 |
+
2025-03-04 17:16:27,859 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
|
7332 |
+
2025-03-04 17:16:28,943 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
|
7333 |
+
2025-03-04 17:16:28,943 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
|
7334 |
+
2025-03-04 17:16:30,062 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r0_c0.png
|
7335 |
+
2025-03-04 17:16:31,485 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c0.png
|
7336 |
+
2025-03-04 17:16:32,831 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c1.png
|
7337 |
+
2025-03-04 17:16:34,357 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
|
7338 |
+
2025-03-04 17:16:35,420 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
|
7339 |
+
2025-03-04 17:16:35,420 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
|
7340 |
+
2025-03-04 17:16:38,997 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r0_c0.png
|
7341 |
+
2025-03-04 17:16:40,384 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c0.png
|
7342 |
+
2025-03-04 17:16:42,086 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c1.png
|
7343 |
+
2025-03-04 17:16:43,960 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c0.png
|
7344 |
+
2025-03-04 17:16:45,362 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c1.png
|
7345 |
+
2025-03-04 17:16:47,152 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c0.png
|
7346 |
+
2025-03-04 17:16:48,540 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c1.png
|
7347 |
+
2025-03-04 17:16:49,983 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
|
7348 |
+
2025-03-04 17:16:51,054 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
|
7349 |
+
2025-03-04 17:16:51,054 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
|
7350 |
+
2025-03-04 17:16:54,130 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r0_c0.png
|
7351 |
+
2025-03-04 17:16:55,482 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c0.png
|
7352 |
+
2025-03-04 17:16:56,770 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c1.png
|
7353 |
+
2025-03-04 17:16:58,649 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c0.png
|
7354 |
+
2025-03-04 17:16:59,944 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c1.png
|
7355 |
+
2025-03-04 17:17:01,397 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c0.png
|
7356 |
+
2025-03-04 17:17:02,716 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c1.png
|
7357 |
+
2025-03-04 17:17:04,306 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c0.png
|
7358 |
+
2025-03-04 17:17:05,735 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c1.png
|
7359 |
+
2025-03-04 17:17:07,796 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
|
7360 |
+
2025-03-04 17:17:08,712 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
|
7361 |
+
2025-03-04 17:17:08,712 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
|
7362 |
+
2025-03-04 17:17:11,791 [WARNING] __main__ - Cell image not found: /tmp/tmp8t199g9l.jpg_rows/row_0/col_0.png
|
7363 |
+
2025-03-04 17:17:12,053 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r1_c0.png
|
7364 |
+
2025-03-04 17:17:13,388 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c0.png
|
7365 |
+
2025-03-04 17:17:14,808 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c1.png
|
7366 |
+
2025-03-04 17:17:16,828 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c0.png
|
7367 |
+
2025-03-04 17:17:18,305 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c1.png
|
7368 |
+
2025-03-04 17:17:20,126 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
|
7369 |
+
2025-03-04 17:17:21,082 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
|
7370 |
+
2025-03-04 17:17:21,082 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
|
7371 |
+
2025-03-04 17:17:22,864 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r0_c0.png
|
7372 |
+
2025-03-04 17:17:24,349 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c0.png
|
7373 |
+
2025-03-04 17:17:25,674 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c1.png
|
7374 |
+
2025-03-04 17:17:27,576 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r2_c0.png
|
7375 |
+
2025-03-04 17:17:28,875 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r2_c1.png
|
7376 |
+
2025-03-04 17:17:30,338 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
|
7377 |
+
2025-03-04 17:17:31,459 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
|
7378 |
+
2025-03-04 17:17:31,460 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
|
7379 |
+
2025-03-04 17:17:33,895 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r0_c0.png
|
7380 |
+
2025-03-04 17:17:35,505 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c0.png
|
7381 |
+
2025-03-04 17:17:36,920 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c1.png
|
7382 |
+
2025-03-04 17:17:38,707 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c0.png
|
7383 |
+
2025-03-04 17:17:40,159 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c1.png
|
7384 |
+
2025-03-04 17:17:42,150 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
|
7385 |
+
2025-03-04 17:17:43,069 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
|
7386 |
+
2025-03-04 17:17:43,069 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
|
7387 |
+
2025-03-04 17:17:44,770 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r0_c0.png
|
7388 |
+
2025-03-04 17:17:46,112 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c0.png
|
7389 |
+
2025-03-04 17:17:47,369 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c1.png
|
7390 |
+
2025-03-04 17:17:48,764 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r2_c0.png
|
7391 |
+
2025-03-04 17:17:50,279 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r2_c1.png
|
7392 |
+
2025-03-04 17:17:52,008 [WARNING] __main__ - Cell image not found: /tmp/tmpyuhd9sl8.jpg_rows/row_3/col_0.png
|
7393 |
+
2025-03-04 17:17:52,009 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
|
7394 |
+
2025-03-04 17:17:54,856 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r0_c0.png
|
7395 |
+
2025-03-04 17:17:56,238 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c0.png
|
7396 |
+
2025-03-04 17:17:58,121 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c1.png
|
7397 |
+
2025-03-04 17:18:00,408 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
|
7398 |
+
2025-03-04 17:18:01,418 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
|
7399 |
+
2025-03-04 17:18:01,418 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
|
7400 |
+
2025-03-04 17:18:03,917 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r0_c0.png
|
7401 |
+
2025-03-04 17:18:05,292 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c0.png
|
7402 |
+
2025-03-04 17:18:07,082 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c1.png
|
7403 |
+
2025-03-04 17:18:08,934 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
|
7404 |
+
2025-03-04 17:18:10,012 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
|
7405 |
+
2025-03-04 17:18:10,012 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
|
7406 |
+
2025-03-04 17:18:11,952 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r0_c0.png
|
7407 |
+
2025-03-04 17:18:13,275 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c0.png
|
7408 |
+
2025-03-04 17:18:14,714 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c1.png
|
7409 |
+
2025-03-04 17:18:16,140 [WARNING] __main__ - Cell image not found: /tmp/tmp91opcy4g.jpg_rows/row_2/col_0.png
|
7410 |
+
2025-03-04 17:18:16,140 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
|
7411 |
+
2025-03-04 17:18:19,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r0_c0.png
|
7412 |
+
2025-03-04 17:18:21,092 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c0.png
|
7413 |
+
2025-03-04 17:18:23,324 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c1.png
|
7414 |
+
2025-03-04 17:18:26,880 [WARNING] __main__ - Cell image not found: /tmp/tmpigalpv91.jpg_rows/row_2/col_0.png
|
7415 |
+
2025-03-04 17:18:26,880 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=three
|
7416 |
+
2025-03-04 17:18:29,208 [WARNING] __main__ - Cell image not found: /tmp/tmppaoedyal.jpg_rows/row_0/col_0.png
|
7417 |
+
2025-03-04 17:18:29,475 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r1_c0.png
|
7418 |
+
2025-03-04 17:18:30,947 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c0.png
|
7419 |
+
2025-03-04 17:18:33,064 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c1.png
|
7420 |
+
2025-03-04 17:18:36,316 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
|
7421 |
+
2025-03-04 17:18:37,482 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
|
7422 |
+
2025-03-04 17:18:37,483 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
|
7423 |
+
2025-03-04 17:18:39,543 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r0_c0.png
|
7424 |
+
2025-03-04 17:18:40,901 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c0.png
|
7425 |
+
2025-03-04 17:18:42,749 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c1.png
|
7426 |
+
2025-03-04 17:18:44,332 [WARNING] __main__ - Cell image not found: /tmp/tmp22n5c_8q.jpg_rows/row_2/col_0.png
|
7427 |
+
2025-03-04 17:18:44,332 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
|
7428 |
+
2025-03-04 17:18:47,634 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r0_c0.png
|
7429 |
+
2025-03-04 17:18:49,048 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c0.png
|
7430 |
+
2025-03-04 17:18:50,572 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c1.png
|
7431 |
+
2025-03-04 17:18:52,196 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c0.png
|
7432 |
+
2025-03-04 17:18:53,636 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c1.png
|
7433 |
+
2025-03-04 17:18:55,054 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
|
7434 |
+
2025-03-04 17:18:56,002 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
|
7435 |
+
2025-03-04 17:18:56,003 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=three
|
7436 |
+
2025-03-04 17:18:58,520 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r0_c0.png
|
7437 |
+
2025-03-04 17:18:59,970 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c0.png
|
7438 |
+
2025-03-04 17:19:01,773 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c1.png
|
7439 |
+
2025-03-04 17:19:03,587 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
|
7440 |
+
2025-03-04 17:19:04,755 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
|
7441 |
+
2025-03-04 17:19:04,755 [INFO] __main__ - Processing table image: /topic-extraction/img_29.jpg, columns=three
|
7442 |
+
2025-03-04 17:19:06,526 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r0_c0.png
|
7443 |
+
2025-03-04 17:19:07,817 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r1_c0.png
|
7444 |
+
2025-03-04 17:19:09,284 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r1_c1.png
|
7445 |
+
2025-03-04 17:19:10,915 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
|
7446 |
+
2025-03-04 17:19:11,969 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
|
7447 |
+
2025-03-04 17:19:11,972 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/pearson_json/_subtopics.json
|
7448 |
+
2025-03-04 17:19:12,278 [INFO] __main__ - GPU memory cleaned up.
|
7449 |
+
2025-03-04 17:19:12,283 [INFO] __main__ - Processing completed successfully.
|
7450 |
+
2025-03-04 17:28:37,803 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf
|
7451 |
+
2025-03-04 17:28:38,622 [INFO] __main__ - Gemini returned subtopics: {'2.1AS units': [7, 22], '2.2A2 units': [23, 43]}
|
7452 |
+
2025-03-04 17:28:38,624 [INFO] __main__ - Loaded 3543551 bytes from local file '/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf'
|
7453 |
+
2025-03-04 17:28:38,859 [INFO] __main__ - Computed global offset: 0
|
7454 |
+
2025-03-04 17:28:38,860 [INFO] __main__ - Processing pages (0-based): [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
|
7455 |
+
2025-03-04 17:29:19,633 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
|
7456 |
+
2025-03-04 17:29:20,237 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
|
7457 |
+
2025-03-04 17:29:20,620 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
|
7458 |
+
2025-03-04 17:29:21,124 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
|
7459 |
+
2025-03-04 17:29:21,413 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
|
7460 |
+
2025-03-04 17:29:21,792 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
|
7461 |
+
2025-03-04 17:29:22,350 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
|
7462 |
+
2025-03-04 17:29:22,827 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
|
7463 |
+
2025-03-04 17:29:23,260 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
|
7464 |
+
2025-03-04 17:29:23,574 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
|
7465 |
+
2025-03-04 17:29:24,083 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
|
7466 |
+
2025-03-04 17:29:24,602 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
|
7467 |
+
2025-03-04 17:29:25,141 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
|
7468 |
+
2025-03-04 17:29:25,442 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
|
7469 |
+
2025-03-04 17:29:26,082 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
|
7470 |
+
2025-03-04 17:29:26,668 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
|
7471 |
+
2025-03-04 17:29:27,176 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
|
7472 |
+
2025-03-04 17:29:27,575 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
|
7473 |
+
2025-03-04 17:29:28,110 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
|
7474 |
+
2025-03-04 17:29:28,509 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
|
7475 |
+
2025-03-04 17:29:29,046 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
|
7476 |
+
2025-03-04 17:29:29,553 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
|
7477 |
+
2025-03-04 17:29:29,936 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
|
7478 |
+
2025-03-04 17:29:30,523 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
|
7479 |
+
2025-03-04 17:29:31,034 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
|
7480 |
+
2025-03-04 17:29:31,417 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
|
7481 |
+
2025-03-04 17:29:31,991 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
|
7482 |
+
2025-03-04 17:29:32,506 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
|
7483 |
+
2025-03-04 17:29:32,884 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
|
7484 |
+
2025-03-04 17:29:33,308 [INFO] __main__ - Classifying images to detect tables.
|
7485 |
+
2025-03-04 17:59:52,883 [INFO] __main__ - GPU memory cleaned up.
|
topic_extraction.py
CHANGED
@@ -299,95 +299,113 @@ def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: st
|
|
299 |
for attempt in range(max_retries + 1):
|
300 |
try:
|
301 |
prompt = """
|
302 |
-
You are given an image from an educational curriculum specification. The image may contain:
|
303 |
1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
|
304 |
-
2) A subtopic heading in the format "<number>.<number>", for example "2.5", "2.6",
|
305 |
3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
|
306 |
-
4) Possibly no relevant text
|
307 |
|
308 |
Your task is to extract:
|
309 |
- **"title"**: A recognized main topic or heading text.
|
310 |
-
- **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4"), as an array of strings.
|
311 |
|
312 |
Follow these rules:
|
313 |
|
314 |
-
(1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued"
|
315 |
-
-
|
316 |
-
-
|
|
|
317 |
|
318 |
-
(2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4"
|
319 |
- Collect those exact strings in the JSON key "subtopics" (an array of strings).
|
320 |
-
- "title" in this case should be an empty string if you only detect subtopics.
|
321 |
(Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).
|
322 |
|
323 |
-
(3) If no main topic or subtopic is detected but the text appears to be a heading
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
|
|
328 |
|
329 |
(4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
|
330 |
-
- Use
|
331 |
- "subtopics" remains empty.
|
332 |
-
Example:
|
333 |
If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
|
334 |
{
|
335 |
"title": "Scarcity, choice and opportunity cost",
|
336 |
"subtopics": []
|
337 |
}
|
338 |
|
339 |
-
(5) **If there is
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
340 |
- Put that label text into "title" (e.g. "G2").
|
341 |
- "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.
|
342 |
|
343 |
-
(
|
344 |
{
|
345 |
"title": "...",
|
346 |
"subtopics": [...]
|
347 |
}
|
348 |
|
349 |
-
(
|
350 |
-
- Contains no words at all (e.g. a blank white or black image)
|
351 |
-
- Contains only
|
352 |
-
- Contains
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
357 |
|
358 |
**Examples**:
|
359 |
|
360 |
-
- If the image text is
|
361 |
{
|
362 |
"title": "2 Algebra and functions",
|
363 |
"subtopics": []
|
364 |
}
|
365 |
|
366 |
-
- If the image text is
|
367 |
{
|
368 |
"title": "",
|
369 |
"subtopics": ["2.5"]
|
370 |
}
|
371 |
|
372 |
-
- If the image text is
|
373 |
{
|
374 |
-
"title": "
|
375 |
"subtopics": []
|
376 |
}
|
377 |
|
378 |
-
- If the left column says
|
379 |
{
|
380 |
"title": "G2",
|
381 |
"subtopics": []
|
382 |
}
|
383 |
|
384 |
-
- If
|
385 |
{
|
386 |
-
"title": "",
|
387 |
"subtopics": []
|
388 |
}
|
389 |
"""
|
390 |
-
|
391 |
global _GEMINI_CLIENT
|
392 |
if _GEMINI_CLIENT is None:
|
393 |
_GEMINI_CLIENT = genai.Client(api_key=api_key)
|
@@ -482,7 +500,6 @@ class S3ImageWriter(DataWriter):
|
|
482 |
elif cls == "THREE_COLUMN":
|
483 |
info['final_alt'] = "HAS TO BE PROCESSED - three column table"
|
484 |
elif cls == "EMPTY_IMAGE":
|
485 |
-
# Remove markdown reference, delete from descriptions and S3.
|
486 |
md_content = md_content.replace(f"", "")
|
487 |
try:
|
488 |
self.s3_writer.delete(info['s3_path'])
|
@@ -865,7 +882,6 @@ class MineruNoTextProcessor:
|
|
865 |
def process(self, pdf_path: str) -> Dict[str, Any]:
|
866 |
logger.info(f"Processing PDF: {pdf_path}")
|
867 |
try:
|
868 |
-
# Possibly call subtopic_extractor on first pages to find subtopics in the PDF as a whole
|
869 |
subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
|
870 |
logger.info(f"Gemini returned subtopics: {subtopics}")
|
871 |
|
|
|
299 |
for attempt in range(max_retries + 1):
|
300 |
try:
|
301 |
prompt = """
|
302 |
+
You are given an image from an educational curriculum specification for Gemini Flash 2. The image may contain:
|
303 |
1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
|
304 |
+
2) A subtopic heading in the format "<number>.<number>" or "<number>.<number>.<number>", for example "2.5", "2.6", "3.4", "2.1.1", "4.3.3" or "1.2.1".
|
305 |
3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
|
306 |
+
4) Possibly no relevant text or only truncated text (e.g. "Topics", "Subject content", "What students need to learn", "Content Amplification Additional guidance notes", etc.).
|
307 |
|
308 |
Your task is to extract:
|
309 |
- **"title"**: A recognized main topic or heading text.
|
310 |
+
- **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4", "2.1.1", "4.1.1"), as an array of strings.
|
311 |
|
312 |
Follow these rules:
|
313 |
|
314 |
+
(1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued":
|
315 |
+
- Remove the word "continued" if present.
|
316 |
+
- Put that resulting text in "title". (e.g. "2 Algebra and functions")
|
317 |
+
- "subtopics" should be an empty array, unless smaller subtopic numbers (e.g. "2.5") are also detected in the same text.
|
318 |
|
319 |
+
(2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4":
|
320 |
- Collect those exact strings in the JSON key "subtopics" (an array of strings).
|
321 |
+
- "title" in this case should be an empty string if you only detect subtopics.
|
322 |
(Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).
|
323 |
|
324 |
+
(3) **If no main topic or subtopic is detected but the text appears to be a heading**, for example "Specialisation, division of labour and exchange", then:
|
325 |
+
- Return:
|
326 |
+
{
|
327 |
+
"title": "<the heading text>",
|
328 |
+
"subtopics": []
|
329 |
+
}
|
330 |
|
331 |
(4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
|
332 |
+
- Use that left column text as "title".
|
333 |
- "subtopics" remains empty.
|
334 |
+
Example:
|
335 |
If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
|
336 |
{
|
337 |
"title": "Scarcity, choice and opportunity cost",
|
338 |
"subtopics": []
|
339 |
}
|
340 |
|
341 |
+
(5) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) or it appears to be a standalone column with text, treat it as a heading.
|
342 |
+
- "subtopics" remains empty.
|
343 |
+
Example:
|
344 |
+
If there is only one column image that is "Specialisation, division of labour and exchange" and the right column is not present, your output is:
|
345 |
+
{
|
346 |
+
"title": "Specialisation, division of labour and exchange",
|
347 |
+
"subtopics": []
|
348 |
+
}
|
349 |
+
|
350 |
+
(6) **If there is a character + digit pattern** in the left column of a two-column table (for example "G2", "G3", "G4", "C1"), treat that as a topic-like label:
|
351 |
- Put that label text into "title" (e.g. "G2").
|
352 |
- "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.
|
353 |
|
354 |
+
(7) **Output must be valid JSON** in this exact structure, with no extra text or explanation:
|
355 |
{
|
356 |
"title": "...",
|
357 |
"subtopics": [...]
|
358 |
}
|
359 |
|
360 |
+
(8) **If the image is blank or truncated**, defined as:
|
361 |
+
- Contains no words at all (e.g. a blank white or black image), **OR**
|
362 |
+
- Contains only snippet words/phrases such as "Topics", "Subject content", "Content Amplification Additional guidance notes", "What students need to learn" (including variations in background color), **OR**
|
363 |
+
- Contains partial headings with no recognizable numeric or textual headings, **OR**
|
364 |
+
- Contains partial UI labels only, such as “Topics” in a gray bar or “What students need to learn” in a blue bar, with no additional meaningful text.
|
365 |
+
then return:
|
366 |
+
{
|
367 |
+
"title": "EMPTY_IMAGE",
|
368 |
+
"subtopics": []
|
369 |
+
}
|
370 |
+
|
371 |
+
(9) **If you cannot recognize any text matching the patterns above**, or the text is too partial/truncated to form a valid heading, also return:
|
372 |
+
{
|
373 |
+
"title": "EMPTY_IMAGE",
|
374 |
+
"subtopics": []
|
375 |
+
}
|
376 |
|
377 |
**Examples**:
|
378 |
|
379 |
+
- If the image text is "2 Algebra and functions continued", return:
|
380 |
{
|
381 |
"title": "2 Algebra and functions",
|
382 |
"subtopics": []
|
383 |
}
|
384 |
|
385 |
+
- If the image text is "2.5 Solve linear and quadratic inequalities ...", return:
|
386 |
{
|
387 |
"title": "",
|
388 |
"subtopics": ["2.5"]
|
389 |
}
|
390 |
|
391 |
+
- If the image text is "Specialisation, division of labour and exchange" (with no numeric patterns at all), return:
|
392 |
{
|
393 |
+
"title": "Specialisation, division of labour and exchange",
|
394 |
"subtopics": []
|
395 |
}
|
396 |
|
397 |
+
- If the left column says "G2" and the right column has details, but no subtopic numbers, return:
|
398 |
{
|
399 |
"title": "G2",
|
400 |
"subtopics": []
|
401 |
}
|
402 |
|
403 |
+
- If the image is blank or shows only partial/truncated snippet words (e.g. "Topics", "Content Amplification Additional guidance notes", "Subject content", "What students need to learn") and nothing else, return:
|
404 |
{
|
405 |
+
"title": "EMPTY_IMAGE",
|
406 |
"subtopics": []
|
407 |
}
|
408 |
"""
|
|
|
409 |
global _GEMINI_CLIENT
|
410 |
if _GEMINI_CLIENT is None:
|
411 |
_GEMINI_CLIENT = genai.Client(api_key=api_key)
|
|
|
500 |
elif cls == "THREE_COLUMN":
|
501 |
info['final_alt'] = "HAS TO BE PROCESSED - three column table"
|
502 |
elif cls == "EMPTY_IMAGE":
|
|
|
503 |
md_content = md_content.replace(f"", "")
|
504 |
try:
|
505 |
self.s3_writer.delete(info['s3_path'])
|
|
|
882 |
def process(self, pdf_path: str) -> Dict[str, Any]:
|
883 |
logger.info(f"Processing PDF: {pdf_path}")
|
884 |
try:
|
|
|
885 |
subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
|
886 |
logger.info(f"Gemini returned subtopics: {subtopics}")
|
887 |
|
worker.py
CHANGED
@@ -139,6 +139,7 @@ class RabbitMQWorker:
|
|
139 |
try:
|
140 |
pdf_url = file.get("url")
|
141 |
logger.info("[Worker %s] Processing topic extraction for URL: %s", thread_id, pdf_url)
|
|
|
142 |
result = self.topic_processor.process(pdf_url)
|
143 |
context = {
|
144 |
"key": file.get("key", ""),
|
|
|
139 |
try:
|
140 |
pdf_url = file.get("url")
|
141 |
logger.info("[Worker %s] Processing topic extraction for URL: %s", thread_id, pdf_url)
|
142 |
+
|
143 |
result = self.topic_processor.process(pdf_url)
|
144 |
context = {
|
145 |
"key": file.get("key", ""),
|