Arsenii11 committed on
Commit
121f305
·
1 Parent(s): 8e381ca

sort subtopic by num

Browse files
__pycache__/mineru_single.cpython-310.pyc CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ
 
topic_extr.py CHANGED
@@ -782,6 +782,119 @@ class MineruNoTextProcessor:
782
  except Exception as e:
783
  logger.error(f"Error during GPU cleanup: {e}")
784
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
785
  def process(self, pdf_path: str) -> Dict[str, Any]:
786
  logger.info(f"Processing PDF: {pdf_path}")
787
  try:
@@ -869,7 +982,12 @@ class MineruNoTextProcessor:
869
  final_markdown = writer.post_process(md_prefix, md_content)
870
 
871
  subtopic_list = list(writer.extracted_subtopics.values())
 
872
 
 
 
 
 
873
  out_path = os.path.join(self.output_folder, "final_subtopics.json")
874
  with open(out_path, "w", encoding="utf-8") as f:
875
  json.dump(subtopic_list, f, indent=2)
 
782
  except Exception as e:
783
  logger.error(f"Error during GPU cleanup: {e}")
784
 
785
def unify_topic_name(raw_title: str, children_subtopics: list) -> str:
    """
    Produce a cleaned-up topic name.

    Two repairs are applied:

    1. A trailing " continued" (any letter case) is removed, e.g.
       "2 Algebra and functions continued" -> "2 Algebra and functions".
    2. If the remaining title is empty or a known truncated fragment
       (currently only "gonometry"), the title is guessed from the leading
       section number of the first child subtopic, e.g. a first child
       titled "5.1 ..." yields "5 Trigonometry".

    Args:
        raw_title: Title as extracted; ``None`` is treated as an empty title.
        children_subtopics: Child subtopic dicts; only the ``"title"`` of the
            first entry is inspected. Missing or empty titles are tolerated.

    Returns:
        The cleaned title. When no repair can be deduced, the stripped title
        (without any trailing " continued") is returned unchanged.
    """
    # Parent titles that can be restored from a child's leading section number.
    # Extend this table when further chapters need repairing.
    known_titles = {
        "2": "2 Algebra and functions",
        "3": "3 Coordinate geometry in the (x, y) plane",
        "4": "4 Statistical distributions",
        "5": "5 Trigonometry",
    }
    # Fragments that are known to be truncated extractions of a real title.
    broken_titles = {"gonometry"}

    title = (raw_title or "").strip()
    # The pattern requires whitespace before "continued", so a title that is
    # only the word "continued" is deliberately left alone.
    title = re.sub(r"\s+continued\s*$", "", title, flags=re.IGNORECASE)

    if title and title.lower() not in broken_titles:
        return title

    if not children_subtopics:
        return title

    # Only the first child is consulted; use .get so a child without a title
    # cannot raise KeyError.
    first_child = children_subtopics[0]
    first_sub = first_child.get("title") if isinstance(first_child, dict) else None
    number_match = re.match(r"^(\d+)\.", (first_sub or "").strip())
    if number_match:
        # Unknown section numbers keep the original (possibly empty) title.
        title = known_titles.get(number_match.group(1), title)

    return title
825
+
826
+
827
def merge_topics(subtopic_list: list) -> list:
    """
    Merge, de-duplicate and sort extracted topics.

    1. Cleans up each topic's title via ``unify_topic_name`` (removes
       " continued", repairs known partial titles).
    2. Merges the contents and children of topics that share a cleaned title.
    3. Within each parent, merges children that share the same stripped title.
    4. Sorts parents by the leading integer of their title, and each parent's
       children by their section number (e.g. "2.1" < "2.2" < "2.10").
       Entries without a number sort last; ties keep their original order.

    Args:
        subtopic_list: Topic dicts with optional "title", "contents" and
            "children" keys. The input dicts and their lists are not mutated.

    Returns:
        A new list of ``{"title", "contents", "children"}`` dicts.
    """
    # NOTE(review): the run log recorded with this change ends with
    # "name 'merge_topics' is not defined". Presumably this def sits inside a
    # class body while the caller uses the bare name; confirm and either hoist
    # it to module level or call it through the class.

    unnumbered = 9999  # sort key for titles that carry no number

    def parent_number(title):
        # Leading integer of the parent title, e.g. "5 Trigonometry" -> 5.
        found = re.match(r"^(\d+)", title or "")
        return int(found.group(1)) if found else unnumbered

    def section_number(title):
        # First dotted number in the title, e.g. "2.11 Foo" -> (2, 11).
        # Later digits in the prose are ignored so they cannot skew the order.
        found = re.search(r"\d+(?:\.\d+)*", title or "")
        if not found:
            return (unnumbered,)
        return tuple(int(part) for part in found.group(0).split("."))

    # Cleaned parent title -> merged parent entry (insertion-ordered).
    merged = {}
    for topic_obj in subtopic_list:
        children = topic_obj.get("children") or []
        contents = topic_obj.get("contents") or []
        new_title = unify_topic_name(topic_obj.get("title", ""), children)

        entry = merged.setdefault(
            new_title,
            {"title": new_title, "contents": [], "children": []},
        )
        entry["contents"].extend(contents)
        entry["children"].extend(children)

    for entry in merged.values():
        by_title = {}
        for child in entry["children"]:
            key = (child.get("title") or "").strip()
            if key not in by_title:
                # Shallow-copy the child and its lists so later merging never
                # writes into the caller's objects.
                clone = dict(child)
                for field in ("contents", "children"):
                    if isinstance(clone.get(field), list):
                        clone[field] = list(clone[field])
                by_title[key] = clone
            else:
                target = by_title[key]
                for field in ("contents", "children"):
                    # Tolerate a missing or None field on either side.
                    target[field] = list(target.get(field) or []) + list(
                        child.get(field) or []
                    )
        entry["children"] = sorted(
            by_title.values(),
            key=lambda ch: section_number(ch.get("title")),
        )

    return sorted(merged.values(), key=lambda item: parent_number(item["title"]))
897
+
898
  def process(self, pdf_path: str) -> Dict[str, Any]:
899
  logger.info(f"Processing PDF: {pdf_path}")
900
  try:
 
982
  final_markdown = writer.post_process(md_prefix, md_content)
983
 
984
  subtopic_list = list(writer.extracted_subtopics.values())
985
+ subtopic_list = merge_topics(subtopic_list)
986
 
987
+ # out_path = os.path.join(self.output_folder, "final_subtopics.json")
988
+ # with open(out_path, "w", encoding="utf-8") as f:
989
+ # json.dump(subtopic_list, f, indent=2)
990
+ # logger.info(f"Final subtopics JSON saved locally at {out_path}")
991
  out_path = os.path.join(self.output_folder, "final_subtopics.json")
992
  with open(out_path, "w", encoding="utf-8") as f:
993
  json.dump(subtopic_list, f, indent=2)
topic_extraction.log CHANGED
@@ -5316,3 +5316,245 @@ and series'. Using page 7.
5316
  2025-03-03 17:32:51,859 [INFO] __main__ - Processing table image => img_28.jpg, columns=two
5317
  2025-03-03 17:32:55,099 [INFO] __main__ - GPU memory cleaned up.
5318
  2025-03-03 17:32:55,099 [ERROR] __main__ - Processing failed: 'LocalImageWriter' object has no attribute 'extracted_subtopics'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5316
  2025-03-03 17:32:51,859 [INFO] __main__ - Processing table image => img_28.jpg, columns=two
5317
  2025-03-03 17:32:55,099 [INFO] __main__ - GPU memory cleaned up.
5318
  2025-03-03 17:32:55,099 [ERROR] __main__ - Processing failed: 'LocalImageWriter' object has no attribute 'extracted_subtopics'
5319
+ 2025-03-03 18:01:47,365 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
5320
+ 2025-03-03 18:01:48,166 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
5321
+ 2025-03-03 18:01:48,167 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
5322
+ 2025-03-03 18:01:48,529 [INFO] __main__ - Computed global offset: 4
5323
+ 2025-03-03 18:01:48,530 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
5324
+ 2025-03-03 18:02:45,151 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
5325
+ 2025-03-03 18:02:47,389 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
5326
+ 2025-03-03 18:02:47,996 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
5327
+ 2025-03-03 18:02:48,658 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
5328
+ 2025-03-03 18:02:49,352 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
5329
+ 2025-03-03 18:02:49,960 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
5330
+ 2025-03-03 18:02:50,659 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
5331
+ 2025-03-03 18:02:51,254 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
5332
+ 2025-03-03 18:02:51,742 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
5333
+ 2025-03-03 18:02:52,344 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
5334
+ 2025-03-03 18:02:52,901 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
5335
+ 2025-03-03 18:02:53,548 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
5336
+ 2025-03-03 18:02:54,179 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
5337
+ 2025-03-03 18:02:54,858 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
5338
+ 2025-03-03 18:02:55,462 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
5339
+ 2025-03-03 18:02:56,140 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
5340
+ 2025-03-03 18:02:56,834 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
5341
+ 2025-03-03 18:02:57,186 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
5342
+ 2025-03-03 18:02:57,895 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
5343
+ 2025-03-03 18:02:58,699 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
5344
+ 2025-03-03 18:02:59,469 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
5345
+ 2025-03-03 18:03:00,063 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
5346
+ 2025-03-03 18:03:00,715 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
5347
+ 2025-03-03 18:03:01,305 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
5348
+ 2025-03-03 18:03:01,790 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
5349
+ 2025-03-03 18:03:02,427 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
5350
+ 2025-03-03 18:03:03,086 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
5351
+ 2025-03-03 18:03:03,840 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
5352
+ 2025-03-03 18:03:04,394 [INFO] __main__ - Classifying images to detect tables.
5353
+ 2025-03-03 18:03:09,642 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
5354
+ 2025-03-03 18:03:13,344 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r0_c0.png
5355
+ 2025-03-03 18:03:14,713 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r0_c1.png
5356
+ 2025-03-03 18:03:16,386 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c0.png
5357
+ 2025-03-03 18:03:18,238 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c1.png
5358
+ 2025-03-03 18:03:19,729 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
5359
+ 2025-03-03 18:03:23,829 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r0_c0.png
5360
+ 2025-03-03 18:03:25,255 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r0_c1.png
5361
+ 2025-03-03 18:03:26,663 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c0.png
5362
+ 2025-03-03 18:03:28,211 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c1.png
5363
+ 2025-03-03 18:03:29,861 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c0.png
5364
+ 2025-03-03 18:03:31,766 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c0.png
5365
+ 2025-03-03 18:03:33,633 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r4_c0.png
5366
+ 2025-03-03 18:03:35,112 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
5367
+ 2025-03-03 18:03:38,486 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r0_c0.png
5368
+ 2025-03-03 18:03:39,547 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r0_c1.png
5369
+ 2025-03-03 18:03:40,727 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c0.png
5370
+ 2025-03-03 18:03:42,551 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c1.png
5371
+ 2025-03-03 18:03:43,977 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
5372
+ 2025-03-03 18:03:47,007 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r0_c0.png
5373
+ 2025-03-03 18:03:47,944 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r0_c1.png
5374
+ 2025-03-03 18:03:49,417 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r1_c0.png
5375
+ 2025-03-03 18:03:51,202 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r1_c1.png
5376
+ 2025-03-03 18:03:52,822 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
5377
+ 2025-03-03 18:03:57,664 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r0_c0.png
5378
+ 2025-03-03 18:03:59,070 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r0_c1.png
5379
+ 2025-03-03 18:04:00,494 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c0.png
5380
+ 2025-03-03 18:04:02,240 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c1.png
5381
+ 2025-03-03 18:04:04,099 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r2_c0.png
5382
+ 2025-03-03 18:04:05,512 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
5383
+ 2025-03-03 18:04:09,932 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r0_c0.png
5384
+ 2025-03-03 18:04:11,364 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r0_c1.png
5385
+ 2025-03-03 18:04:12,780 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c0.png
5386
+ 2025-03-03 18:04:14,521 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c1.png
5387
+ 2025-03-03 18:04:16,038 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
5388
+ 2025-03-03 18:04:17,799 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r2_c1.png
5389
+ 2025-03-03 18:04:19,184 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
5390
+ 2025-03-03 18:04:23,663 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r0_c0.png
5391
+ 2025-03-03 18:04:24,739 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r0_c1.png
5392
+ 2025-03-03 18:04:26,232 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c0.png
5393
+ 2025-03-03 18:04:28,388 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c1.png
5394
+ 2025-03-03 18:04:30,206 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c0.png
5395
+ 2025-03-03 18:04:31,473 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
5396
+ 2025-03-03 18:04:34,576 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r0_c0.png
5397
+ 2025-03-03 18:04:35,800 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r0_c1.png
5398
+ 2025-03-03 18:04:37,238 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r0_c2.png
5399
+ 2025-03-03 18:04:38,721 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r1_c0.png
5400
+ 2025-03-03 18:04:40,069 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r1_c1.png
5401
+ 2025-03-03 18:04:41,915 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r1_c2.png
5402
+ 2025-03-03 18:04:43,317 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c0.png
5403
+ 2025-03-03 18:04:44,758 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c1.png
5404
+ 2025-03-03 18:04:46,354 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c0.png
5405
+ 2025-03-03 18:04:47,962 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c1.png
5406
+ 2025-03-03 18:04:49,441 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
5407
+ 2025-03-03 18:04:51,291 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r4_c1.png
5408
+ 2025-03-03 18:04:52,840 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=two
5409
+ 2025-03-03 18:04:56,905 [WARNING] __main__ - Cell image not found: /tmp/tmp5hkh4jpv.jpg_rows/row_0/col_0.png
5410
+ 2025-03-03 18:04:57,168 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r0_c1.png
5411
+ 2025-03-03 18:04:58,554 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c0.png
5412
+ 2025-03-03 18:05:00,216 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c1.png
5413
+ 2025-03-03 18:05:02,113 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c0.png
5414
+ 2025-03-03 18:05:03,482 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
5415
+ 2025-03-03 18:05:07,505 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r0_c0.png
5416
+ 2025-03-03 18:05:08,634 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r0_c1.png
5417
+ 2025-03-03 18:05:09,820 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c0.png
5418
+ 2025-03-03 18:05:11,434 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c1.png
5419
+ 2025-03-03 18:05:13,181 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
5420
+ 2025-03-03 18:05:15,198 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r3_c0.png
5421
+ 2025-03-03 18:05:16,840 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r4_c0.png
5422
+ 2025-03-03 18:05:18,564 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r5_c0.png
5423
+ 2025-03-03 18:05:19,773 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=two
5424
+ 2025-03-03 18:05:23,555 [WARNING] __main__ - Cell image not found: /tmp/tmpmqfa7baf.jpg_rows/row_0/col_0.png
5425
+ 2025-03-03 18:05:23,816 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r0_c1.png
5426
+ 2025-03-03 18:05:25,422 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c0.png
5427
+ 2025-03-03 18:05:27,100 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
5428
+ 2025-03-03 18:05:28,749 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r3_c0.png
5429
+ 2025-03-03 18:05:30,528 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r4_c0.png
5430
+ 2025-03-03 18:05:32,582 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r5_c0.png
5431
+ 2025-03-03 18:05:34,027 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
5432
+ 2025-03-03 18:05:38,149 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r0_c0.png
5433
+ 2025-03-03 18:05:39,165 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r0_c1.png
5434
+ 2025-03-03 18:05:40,592 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c0.png
5435
+ 2025-03-03 18:05:42,353 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c1.png
5436
+ 2025-03-03 18:05:44,001 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
5437
+ 2025-03-03 18:05:45,790 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r2_c1.png
5438
+ 2025-03-03 18:05:47,303 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
5439
+ 2025-03-03 18:05:50,408 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
5440
+ 2025-03-03 18:05:51,470 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c1.png
5441
+ 2025-03-03 18:05:53,168 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c0.png
5442
+ 2025-03-03 18:05:54,502 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c1.png
5443
+ 2025-03-03 18:05:56,341 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
5444
+ 2025-03-03 18:05:58,223 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c0.png
5445
+ 2025-03-03 18:05:59,584 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
5446
+ 2025-03-03 18:06:02,910 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r0_c0.png
5447
+ 2025-03-03 18:06:03,811 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r0_c1.png
5448
+ 2025-03-03 18:06:04,999 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c0.png
5449
+ 2025-03-03 18:06:06,682 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c1.png
5450
+ 2025-03-03 18:06:08,714 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
5451
+ 2025-03-03 18:06:10,540 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r3_c0.png
5452
+ 2025-03-03 18:06:12,055 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r4_c0.png
5453
+ 2025-03-03 18:06:13,546 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r4_c1.png
5454
+ 2025-03-03 18:06:14,846 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
5455
+ 2025-03-03 18:06:18,269 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r0_c0.png
5456
+ 2025-03-03 18:06:19,578 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r0_c1.png
5457
+ 2025-03-03 18:06:20,905 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r0_c2.png
5458
+ 2025-03-03 18:06:22,274 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c0.png
5459
+ 2025-03-03 18:06:23,653 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c1.png
5460
+ 2025-03-03 18:06:25,318 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c2.png
5461
+ 2025-03-03 18:06:26,709 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c0.png
5462
+ 2025-03-03 18:06:28,080 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c1.png
5463
+ 2025-03-03 18:06:29,474 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c0.png
5464
+ 2025-03-03 18:06:31,739 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c1.png
5465
+ 2025-03-03 18:06:33,031 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
5466
+ 2025-03-03 18:06:36,448 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r0_c0.png
5467
+ 2025-03-03 18:06:37,446 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r0_c1.png
5468
+ 2025-03-03 18:06:38,581 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c0.png
5469
+ 2025-03-03 18:06:40,126 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c1.png
5470
+ 2025-03-03 18:06:41,796 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c0.png
5471
+ 2025-03-03 18:06:43,330 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c0.png
5472
+ 2025-03-03 18:06:45,025 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c1.png
5473
+ 2025-03-03 18:06:46,832 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c0.png
5474
+ 2025-03-03 18:06:48,096 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
5475
+ 2025-03-03 18:06:51,269 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r0_c0.png
5476
+ 2025-03-03 18:06:52,315 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r0_c1.png
5477
+ 2025-03-03 18:06:53,734 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r1_c0.png
5478
+ 2025-03-03 18:06:55,297 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r1_c1.png
5479
+ 2025-03-03 18:06:57,056 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c0.png
5480
+ 2025-03-03 18:06:58,396 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c0.png
5481
+ 2025-03-03 18:06:59,922 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c1.png
5482
+ 2025-03-03 18:07:01,718 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
5483
+ 2025-03-03 18:07:03,388 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r5_c0.png
5484
+ 2025-03-03 18:07:04,651 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
5485
+ 2025-03-03 18:07:05,752 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r0_c0.png
5486
+ 2025-03-03 18:07:06,841 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r0_c1.png
5487
+ 2025-03-03 18:07:08,263 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c0.png
5488
+ 2025-03-03 18:07:09,611 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c1.png
5489
+ 2025-03-03 18:07:11,064 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
5490
+ 2025-03-03 18:07:13,613 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r0_c0.png
5491
+ 2025-03-03 18:07:14,973 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r0_c1.png
5492
+ 2025-03-03 18:07:16,339 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c0.png
5493
+ 2025-03-03 18:07:18,493 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c1.png
5494
+ 2025-03-03 18:07:20,139 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c0.png
5495
+ 2025-03-03 18:07:21,807 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c1.png
5496
+ 2025-03-03 18:07:23,091 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
5497
+ 2025-03-03 18:07:26,528 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r0_c0.png
5498
+ 2025-03-03 18:07:27,947 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r0_c1.png
5499
+ 2025-03-03 18:07:29,439 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c0.png
5500
+ 2025-03-03 18:07:31,257 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c1.png
5501
+ 2025-03-03 18:07:32,705 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
5502
+ 2025-03-03 18:07:35,869 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r0_c0.png
5503
+ 2025-03-03 18:07:37,403 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r0_c1.png
5504
+ 2025-03-03 18:07:38,804 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c0.png
5505
+ 2025-03-03 18:07:40,651 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c1.png
5506
+ 2025-03-03 18:07:42,292 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
5507
+ 2025-03-03 18:07:43,964 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r2_c1.png
5508
+ 2025-03-03 18:07:45,304 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
5509
+ 2025-03-03 18:07:48,679 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r0_c0.png
5510
+ 2025-03-03 18:07:49,776 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r0_c1.png
5511
+ 2025-03-03 18:07:51,108 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c0.png
5512
+ 2025-03-03 18:07:52,641 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c1.png
5513
+ 2025-03-03 18:07:54,136 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
5514
+ 2025-03-03 18:07:55,772 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r2_c1.png
5515
+ 2025-03-03 18:07:57,135 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
5516
+ 2025-03-03 18:08:00,271 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r0_c0.png
5517
+ 2025-03-03 18:08:01,671 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r0_c1.png
5518
+ 2025-03-03 18:08:03,176 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c0.png
5519
+ 2025-03-03 18:08:04,801 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c1.png
5520
+ 2025-03-03 18:08:06,459 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r2_c0.png
5521
+ 2025-03-03 18:08:08,351 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r2_c1.png
5522
+ 2025-03-03 18:08:10,016 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
5523
+ 2025-03-03 18:08:13,320 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r0_c0.png
5524
+ 2025-03-03 18:08:14,451 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r0_c1.png
5525
+ 2025-03-03 18:08:15,533 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c0.png
5526
+ 2025-03-03 18:08:17,196 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c1.png
5527
+ 2025-03-03 18:08:19,105 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r2_c0.png
5528
+ 2025-03-03 18:08:20,442 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=three
5529
+ 2025-03-03 18:08:23,613 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r0_c0.png
5530
+ 2025-03-03 18:08:24,927 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r0_c1.png
5531
+ 2025-03-03 18:08:26,271 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r0_c2.png
5532
+ 2025-03-03 18:08:27,756 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r1_c0.png
5533
+ 2025-03-03 18:08:29,059 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r1_c1.png
5534
+ 2025-03-03 18:08:30,619 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r1_c2.png
5535
+ 2025-03-03 18:08:32,028 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c0.png
5536
+ 2025-03-03 18:08:33,702 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c1.png
5537
+ 2025-03-03 18:08:35,546 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
5538
+ 2025-03-03 18:08:37,241 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r3_c1.png
5539
+ 2025-03-03 18:08:38,602 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
5540
+ 2025-03-03 18:08:41,789 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r0_c0.png
5541
+ 2025-03-03 18:08:42,904 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r0_c1.png
5542
+ 2025-03-03 18:08:44,299 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c0.png
5543
+ 2025-03-03 18:08:45,765 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c1.png
5544
+ 2025-03-03 18:08:47,625 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r2_c0.png
5545
+ 2025-03-03 18:08:49,450 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r3_c0.png
5546
+ 2025-03-03 18:08:50,706 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
5547
+ 2025-03-03 18:08:53,864 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r0_c0.png
5548
+ 2025-03-03 18:08:55,294 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r0_c1.png
5549
+ 2025-03-03 18:08:56,673 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c0.png
5550
+ 2025-03-03 18:08:58,397 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c1.png
5551
+ 2025-03-03 18:09:00,147 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c0.png
5552
+ 2025-03-03 18:09:01,840 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
5553
+ 2025-03-03 18:09:03,256 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r4_c0.png
5554
+ 2025-03-03 18:09:04,820 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r4_c1.png
5555
+ 2025-03-03 18:09:06,037 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=two
5556
+ 2025-03-03 18:09:09,419 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r0_c0.png
5557
+ 2025-03-03 18:09:11,243 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c0.png
5558
+ 2025-03-03 18:09:13,257 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
5559
+ 2025-03-03 18:09:15,022 [INFO] __main__ - GPU memory cleaned up.
5560
+ 2025-03-03 18:09:15,023 [ERROR] __main__ - Processing failed: name 'merge_topics' is not defined