MinerU

Paused

App Files Files Community

SkyNait committed on Mar 4

Commit

f81cfef

1 Parent(s): ae8cbf3

enhanced row output filtering

Browse files

Files changed (9) hide show

__pycache__/inference_svm_model.cpython-310.pyc +0 -0
__pycache__/mineru_single.cpython-310.pyc +0 -0
__pycache__/table_row_extraction.cpython-310.pyc +0 -0
__pycache__/topic_extraction.cpython-310.pyc +0 -0
__pycache__/worker.cpython-310.pyc +0 -0
output1.pdf +0 -3
pearson_json/_subtopics.json +218 -56
topic_extr.py +989 -0
topic_extraction.log +234 -0

__pycache__/inference_svm_model.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ

__pycache__/mineru_single.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ

__pycache__/table_row_extraction.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/table_row_extraction.cpython-310.pyc and b/__pycache__/table_row_extraction.cpython-310.pyc differ

__pycache__/topic_extraction.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/topic_extraction.cpython-310.pyc and b/__pycache__/topic_extraction.cpython-310.pyc differ

__pycache__/worker.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ

output1.pdf DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2b2f32c4f39c66673ac775c4061a57259a92b5fc69e81fec46374a9a0eb492b2
-size 123145

pearson_json/_subtopics.json CHANGED Viewed

@@ -1,122 +1,284 @@
 [
   {
-    "title": "Content",
     "contents": [
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_1.jpg_r0_c0.png"
-      },
       {
         "type": "image",
         "key": "/topic-extraction/cells/img_2.jpg_r0_c0.png"
       },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_3.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_4.jpg_r1_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_5.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_6.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_7.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_8.jpg_r1_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_9.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_10.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_11.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_12.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_13.jpg_r0_c1.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_14.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_15.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_16.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_17.jpg_r1_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_18.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_19.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_20.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_21.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_22.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_23.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_24.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_25.jpg_r1_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_26.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_27.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_28.jpg_r0_c0.png"
-      },
       {
         "type": "image",
-        "key": "/topic-extraction/cells/img_29.jpg_r0_c0.png"
       }
     ],
     "children": []

 [
   {
+    "title": "Scarcity, choice and opportunity cost",
     "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_1.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Content Amplification Additional guidance notes",
+    "contents": [
       {
         "type": "image",
         "key": "/topic-extraction/cells/img_2.jpg_r0_c0.png"
       },
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_18.jpg_r0_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Price, income and cross price elasticities of demand, price elasticity of supply",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_3.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Wage determination",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_4.jpg_r2_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "How resources are allocated in a free market economy",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_5.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Understanding market failure",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_6.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Why and how governments intervene in markets",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_7.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "The circular flow of income model",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_8.jpg_r2_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "The AD function",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_9.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Government policy objectives",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_10.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Fiscal policy",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_11.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Monetary policy",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_12.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Exchange rates and exchange rate policy",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_13.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Free trade and protectionism",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_14.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Costs, revenues and profits",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_15.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Background to market structures",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_16.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Monopoly",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_17.jpg_r2_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Short run aggregate supply (SRAS)",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_19.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "The short run Phillips curve",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_20.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Economic growth",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_21.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Unemployment",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_22.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Solutions",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_23.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Inflation and deflation",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_24.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "The balance of payments",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_25.jpg_r2_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Control of the national (public sector) debt",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_26.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "The operation of monetary policy and monetary stability",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_27.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "Advantages and disadvantages of free trade",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_28.jpg_r1_c0.png"
+      }
+    ],
+    "children": []
+  },
+  {
+    "title": "European Union",
+    "contents": [
       {
         "type": "image",
+        "key": "/topic-extraction/cells/img_29.jpg_r1_c0.png"
       }
     ],
     "children": []

topic_extr.py ADDED Viewed

	@@ -0,0 +1,989 @@

+#!/usr/bin/env python3
+import os
+import re
+import gc
+import json
+import logging
+import fitz
+import boto3
+import base64
+import time
+import asyncio
+import tempfile
+import requests
+from io import BytesIO
+from typing import List, Dict, Any
+import torch
+import cv2
+import numpy as np
+from google import genai
+from google.genai import types
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+from magic_pdf.data.data_reader_writer.base import DataWriter
+from table_row_extraction import TableExtractor
+# Console logging at INFO for the whole process (no-op if the root logger is already configured).
+logging.basicConfig(level=logging.INFO)
+# Additional file sink for this module's records.
+file_handler = logging.FileHandler("topic_extraction.log")
+file_handler.setFormatter(
+    logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s")
+)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+logger.addHandler(file_handler)
+# Shared Gemini client; created on first use by the call_gemini_* helpers.
+_GEMINI_CLIENT = None
+# helper functions, also global
+def unify_whitespace(text: str) -> str:
+    """Collapse every run of whitespace in ``text`` into one space and trim both ends."""
+    collapsed = re.sub(r"\s+", " ", text)
+    return collapsed.strip()
+def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
+    """
+    Return the 0-based indices of every page whose text contains ``search_text``.
+    Both the needle and each page's text are whitespace-normalised with
+    ``unify_whitespace`` before the substring test, so line breaks in the PDF
+    do not prevent a match.
+    Args:
+        pdf_bytes: Raw bytes of the PDF to scan.
+        search_text: Text to look for. An empty (or whitespace-only) string
+            matches every page, because the empty string is a substring of anything.
+    Returns:
+        Page indices in ascending order.
+    """
+    st_norm = unify_whitespace(search_text)
+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    try:
+        # NOTE(review): "raw" is not one of PyMuPDF's documented get_text() options;
+        # presumably it falls back to plain text -- confirm against the installed version.
+        # Pages are visited in order, so the result is already sorted.
+        return [
+            i for i in range(doc.page_count)
+            if st_norm in unify_whitespace(doc[i].get_text("raw"))
+        ]
+    finally:
+        # Close the document even when text extraction raises.
+        doc.close()
+def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
+    """
+    Build a new PDF containing only the requested pages of the original.
+    Args:
+        original_pdf_bytes: Raw bytes of the source PDF.
+        page_indices: 0-based page indices to keep. Duplicates are dropped and
+            pages are emitted in ascending index order.
+    Returns:
+        The bytes of the new PDF.
+    Raises:
+        ValueError: If ``page_indices`` is empty or any index is outside
+            ``0..page_count - 1``.
+    """
+    if not page_indices:
+        raise ValueError("No page indices provided for subset creation.")
+    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
+    try:
+        new_doc = fitz.open()
+        try:
+            for p in sorted(set(page_indices)):
+                if not 0 <= p < doc.page_count:
+                    logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
+                    raise ValueError(f"Page index {p} out of range.")
+                new_doc.insert_pdf(doc, from_page=p, to_page=p)
+            return new_doc.tobytes()
+        finally:
+            # Release both documents on every path, including the ValueError above.
+            new_doc.close()
+    finally:
+        doc.close()
+def unify_topic_name(raw_title: str, children_subtopics: list) -> str:
+    """
+    Clean up a topic title.
+    - Strips surrounding whitespace and a trailing "continued" (case-insensitive).
+    - A title that already starts with a digit is returned as is.
+    - A known truncated title ("gonometry") whose first numbered child is "5.x"
+      is repaired to "5 Trigonometry".
+    - Otherwise, if every numbered child ("<n>.<...>") shares the same leading
+      number, that number is prepended ("<n> <title>", or "<n> Topic" when the
+      title is empty).
+    Args:
+        raw_title: Title as extracted.
+        children_subtopics: Child dicts; only their "title" values are read.
+    Returns:
+        The cleaned title.
+    """
+    title = raw_title.strip()
+    # Remove trailing "continued"
+    title = re.sub(r"\s+continued\s*$", "", title, flags=re.IGNORECASE)
+    # If title already starts with a number, use it as is.
+    if re.match(r"^\d+", title):
+        return title
+    # Otherwise, try to deduce a numeric prefix from the children.
+    prefixes = []
+    for child in children_subtopics:
+        child_title = child.get("title", "").strip()
+        m = re.match(r"^(\d+)\.", child_title)
+        if m:
+            prefixes.append(m.group(1))
+    # Repair known broken titles BEFORE prefixing. Doing it afterwards meant a
+    # title with consistent "5.x" children had already become "5 gonometry" and
+    # no longer matched the lookup, so the repair never fired in its main case.
+    if title.lower() in {"gonometry"} and prefixes and prefixes[0] == "5":
+        return "5 Trigonometry"
+    # Prepend the shared numeric prefix only when all numbered children agree.
+    if prefixes and all(p == prefixes[0] for p in prefixes):
+        title = f"{prefixes[0]} {title}" if title else f"{prefixes[0]} Topic"
+    return title
+def merge_topics(subtopic_list: list) -> list:
+    """
+    Merge extracted topic dicts into a de-duplicated, ordered topic list.
+    1. Clean up each topic's title using unify_topic_name.
+    2. Group topics by the title's leading number (if any); topics without one are grouped by full title.
+       When groups collide, contents and children are concatenated and the longer title wins.
+    3. Reassign children: a child titled "<n>.<...>" that sits under a different group is moved to
+       group "<n>" when that group exists.
+    4. Merge children that share the same (stripped) title by concatenating contents and children.
+    5. Sort each parent's children by all numbers in their title, and parents by leading number;
+       entries without a number sort last.
+    Args:
+        subtopic_list: Dicts with optional "title", "contents" and "children" keys.
+    Returns:
+        New list of merged parent dicts. Child dicts are the caller's own objects and may be
+        extended in place during de-duplication.
+    """
+    def parse_subtopic_num(subtitle):
+        # "2.10 foo" -> (2, 10), so "2.2" sorts before "2.10".
+        digits = re.findall(r"\d+", subtitle)
+        return tuple(int(d) for d in digits) if digits else (9999,)
+    def parse_parent_num(topic):
+        m = re.match(r"^(\d+)", topic.get("title", ""))
+        return int(m.group(1)) if m else 9999
+    # First, merge topics by parent's numeric prefix.
+    merged = {}
+    for topic_obj in subtopic_list:
+        raw_title = topic_obj.get("title", "")
+        children = topic_obj.get("children", [])
+        contents = topic_obj.get("contents", [])
+        new_title = unify_topic_name(raw_title, children)
+        # Extract parent's numeric prefix, if present.
+        m = re.match(r"^(\d+)", new_title)
+        key = m.group(1) if m else new_title
+        if key not in merged:
+            merged[key] = {
+                "title": new_title,
+                "contents": list(contents),
+                "children": list(children),
+            }
+        else:
+            # Merge contents and children; choose the longer title.
+            if len(new_title) > len(merged[key]["title"]):
+                merged[key]["title"] = new_title
+            merged[key]["contents"].extend(contents)
+            merged[key]["children"].extend(children)
+    # Reassign children to the correct parent based on their numeric prefix.
+    # Only list contents change here; the dict's key set is never modified while iterating.
+    for key, topic in merged.items():
+        new_children = []
+        for child in topic["children"]:
+            child_title = child.get("title", "").strip()
+            m_child = re.match(r"^(\d+)\.", child_title)
+            if m_child:
+                child_prefix = m_child.group(1)
+                if key != child_prefix and child_prefix in merged:
+                    # Reassign this child to the proper parent.
+                    merged[child_prefix]["children"].append(child)
+                    continue
+            new_children.append(child)
+        topic["children"] = new_children
+    # Remove duplicate children by merging their contents.
+    for topic in merged.values():
+        child_map = {}
+        for child in topic["children"]:
+            ctitle = child.get("title", "").strip()
+            kept = child_map.get(ctitle)
+            if kept is None:
+                child_map[ctitle] = child
+            else:
+                # setdefault: the first-seen child may lack these keys; indexing
+                # them directly raised KeyError on such input.
+                kept.setdefault("contents", []).extend(child.get("contents", []))
+                kept.setdefault("children", []).extend(child.get("children", []))
+        topic["children"] = list(child_map.values())
+        # Sort children by full numeric order (e.g. "2.1" < "2.2" < "2.10").
+        topic["children"].sort(key=lambda ch: parse_subtopic_num(ch.get("title", "")))
+    # Convert merged topics to a sorted list.
+    final_list = list(merged.values())
+    final_list.sort(key=parse_parent_num)
+    return final_list
+class s3Writer:
+    """Small boto3 S3 client wrapper bound to a single bucket."""
+    def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
+        """Create the S3 client from an access key, secret key and endpoint URL."""
+        credentials = {"aws_access_key_id": ak, "aws_secret_access_key": sk}
+        self.bucket = bucket
+        self.client = boto3.client('s3', endpoint_url=endpoint_url, **credentials)
+    def write(self, path: str, data: bytes) -> None:
+        """Upload ``data`` to key ``path``; failures are logged and re-raised."""
+        try:
+            self.client.upload_fileobj(BytesIO(data), self.bucket, path)
+            logger.info(f"Uploaded to S3: {path}")
+        except Exception as exc:
+            logger.error(f"Failed to upload to S3: {str(exc)}")
+            raise
+    def delete(self, path: str) -> None:
+        """Delete key ``path`` from the bucket; failures are logged and re-raised."""
+        try:
+            self.client.delete_object(Key=path, Bucket=self.bucket)
+        except Exception as exc:
+            logger.error(f"Failed to delete from S3: {str(exc)}")
+            raise
+def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
+    """
+    Downscale and re-encode an image as JPEG.
+    Args:
+        image_data: Encoded image bytes.
+        max_dim: Longest allowed side in pixels; larger images are scaled down
+            proportionally, smaller ones keep their size.
+        quality: JPEG quality passed to the encoder.
+    Returns:
+        JPEG bytes on success. The original ``image_data`` is returned unchanged
+        when it is empty, cannot be decoded, or cannot be re-encoded.
+    """
+    # cv2.imdecode raises on an empty buffer instead of returning None, which
+    # would break the "fall back to the original bytes" contract below.
+    if not image_data:
+        return image_data
+    arr = np.frombuffer(image_data, np.uint8)
+    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
+    if img is None:
+        return image_data
+    h, w = img.shape[:2]
+    longest = max(h, w)
+    if longest > max_dim:
+        scale = max_dim / float(longest)
+        img = cv2.resize(img, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_AREA)
+    encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
+    success, enc = cv2.imencode(".jpg", img, encode_params)
+    return enc.tobytes() if success else image_data
+def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
+    """
+    Ask Gemini to classify an image as a two-column table, three-column table, or neither.
+    Args:
+        image_data: JPEG bytes of the image (sent inline, base64-encoded).
+        api_key: Gemini API key; only used to create the shared module-level
+            client the first time any call_gemini_* helper runs.
+        max_retries: Extra attempts after the first failure (0.5 s pause between attempts).
+            An error whose message contains "503" is not retried.
+    Returns:
+        "TWO_COLUMN", "THREE_COLUMN", "EMPTY_IMAGE" (only if the model's reply contains
+        "EMPTY"), or "NO_TABLE" for anything else, including every failure path.
+    """
+    global _GEMINI_CLIENT
+    # The prompt does not depend on the attempt, so build it once.
+    # NOTE(review): the prompt never offers an "EMPTY" label, so the EMPTY_IMAGE
+    # branch below presumably fires rarely, if ever -- confirm whether that is intended.
+    prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
+The three-column 'table' image includes such key features:
+    - Three columns header
+    - Headers like 'Topics', 'Content', 'Guidelines', 'Amplification', 'Additional guidance notes', 'Area of Study'
+    - Possibly sections (e.g. 8.4, 9.1)
+The two-column 'table' image includes such key features:
+    - Two columns
+    - Headers like 'Subject content', 'Additional information'
+    - Possibly sections (e.g. 2.1, 3.4, G2, G3, )
+If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
+If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
+If the image is non-empty but does not show a table, respond with 'NO_TABLE'.
+Return only one of these exact labels.
+"""
+    for attempt in range(max_retries + 1):
+        try:
+            if _GEMINI_CLIENT is None:
+                _GEMINI_CLIENT = genai.Client(api_key=api_key)
+            client = _GEMINI_CLIENT
+            resp = client.models.generate_content(
+                model="gemini-2.0-flash",
+                contents=[
+                    {
+                        "parts": [
+                            {"text": prompt},
+                            {
+                                "inline_data": {
+                                    "mime_type": "image/jpeg",
+                                    "data": base64.b64encode(image_data).decode('utf-8')
+                                }
+                            }
+                        ]
+                    }
+                ],
+                config=types.GenerateContentConfig(temperature=0.0)
+            )
+            if resp and resp.text:
+                classification = resp.text.strip().upper()
+                if "THREE" in classification:
+                    return "THREE_COLUMN"
+                elif "TWO" in classification:
+                    return "TWO_COLUMN"
+                elif "EMPTY" in classification:
+                    return "EMPTY_IMAGE"
+            return "NO_TABLE"
+        except Exception as e:
+            logger.error(f"Gemini table classification error: {e}")
+            if "503" in str(e):
+                return "NO_TABLE"
+            if attempt < max_retries:
+                time.sleep(0.5)
+            else:
+                return "NO_TABLE"
+    # Only reachable when max_retries < 0 (no attempt made); previously this
+    # path fell off the end and returned None despite the ``-> str`` contract.
+    return "NO_TABLE"
+async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
+    """
+    Classify one image without blocking the event loop on the Gemini call.
+    The image is shrunk with ``preprocess_image`` first (synchronously, on the
+    calling task), then ``call_gemini_for_table_classification`` runs in the
+    loop's default executor.
+    Returns:
+        The label returned by ``call_gemini_for_table_classification``.
+    """
+    # Inside a coroutine a loop is always running; get_running_loop() is the
+    # supported way to obtain it, whereas get_event_loop() is deprecated for this use.
+    loop = asyncio.get_running_loop()
+    preprocessed = preprocess_image(image_data)
+    return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
+def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
+    """
+    Ask Gemini to extract a topic title and subtopic numbers from a table-cell image.
+    Args:
+        image_data: Image bytes, sent inline as base64 with MIME type image/jpeg.
+        api_key: Gemini API key; only used to create the shared module-level
+            client if it does not exist yet.
+        max_retries: Extra attempts after a failed one (0.5 s pause between attempts).
+    Returns:
+        A dict ``{"title": str, "subtopics": list}``. The title is "EMPTY_IMAGE"
+        (with no subtopics) when the model reports a blank/truncated image.
+        ``{"title": "", "subtopics": []}`` is returned when the model sends an
+        empty reply or when every attempt raised (including unparsable JSON).
+    """
+    for attempt in range(max_retries + 1):
+        try:
+            prompt = """
+You are given an image from an educational curriculum specification for Gemini Flash 2. The image may contain:
+1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
+2) A subtopic heading in the format "<number>.<number>" or "<number>.<number>.<number>", for example "2.5", "2.6", "3.4", "2.1.1", "4.3.3" or "1.2.1".
+3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
+4) Possibly no relevant text or only truncated text (e.g. "Topics", "Subject content", "What students need to learn", "Content Amplification Additional guidance notes", etc.).
+Your task is to extract:
+- **"title"**: A recognized main topic or heading text.
+- **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4", "G2", "2.1.1", "4.1.1"), as an array of strings.
+Follow these rules:
+(1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued":
+    - Remove the word "continued" if present.
+    - Put that resulting text in "title". (e.g. "2 Algebra and functions")
+    - "subtopics" should be an empty array, unless smaller subtopic numbers (e.g. "2.5") are also detected in the same text.
+(2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4":
+    - Collect those exact strings in the JSON key "subtopics" (an array of strings).
+    - "title" in this case should be an empty string if you only detect subtopics.
+      (Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).
+(3) **If no main topic or subtopic is detected but the text appears to be a heading**, for example "Specialisation, division of labour and exchange", then:
+    - Return:
+      {
+        "title": "<the heading text>",
+        "subtopics": []
+      }
+(4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
+    - Use that left column text as "title".
+    - "subtopics" remains empty.
+    Example:
+    If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
+    {
+      "title": "Scarcity, choice and opportunity cost",
+      "subtopics": []
+    }
+(5) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) or it appears to be a standalone column with text, treat it as a heading.
+    - "subtopics" remains empty.
+    Example:
+    If there is only one column image that is "Specialisation, devision of labour and exchange" and the right column is not present, your output is:
+    {
+      "title": "Specialisation, devision of labour and exchange",
+      "subtopics": []
+    }
+(6) **If there is a character + digit pattern** in the left column of a two-column table (for example "G2", "G3", "G4", "C1"), treat that as a topic-like label:
+    - Put that label text into "title" (e.g. "G2").
+    - "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.
+(7) **Output must be valid JSON** in this exact structure, with no extra text or explanation:
+    {
+      "title": "...",
+      "subtopics": [...]
+    }
+(8) **If the image is blank or truncated**, defined as:
+    - Contains no words at all (e.g. a blank white or black image), **OR**
+    - Contains only snippet words/phrases such as "Topics", "Subject content", "Content Amplification Additional guidance notes", "What students need to learn" (including variations in background color), **OR**
+    - Contains partial headings with no recognizable numeric or textual headings
+    - Contains partial UI labels only, such as “Topics” in a gray bar or “What students need to learn” in a blue bar, with no additional meaningful text.
+    then return:
+    {
+      "title": "EMPTY_IMAGE",
+      "subtopics": []
+    }
+(9) **If you cannot recognize any text matching the patterns above**, or the text is too partial/truncated to form a valid heading, also return:
+    {
+      "title": "EMPTY_IMAGE",
+      "subtopics": []
+    }
+**Examples**:
+- If the image text is "2 Algebra and functions continued", return:
+  {
+    "title": "2 Algebra and functions",
+    "subtopics": []
+  }
+- If the image text is "2.5 Solve linear and quadratic inequalities ...", return:
+  {
+    "title": "",
+    "subtopics": ["2.5"]
+  }
+- If the image text is "Specialisation, division of labour and exchange" (with no numeric patterns at all), return:
+  {
+    "title": "Specialisation, division of labour and exchange",
+    "subtopics": []
+  }
+- If the left column says "G2" and the right column has details, but no subtopic numbers, return:
+  {
+    "title": "G2",
+    "subtopics": []
+  }
+- If the image is blank or shows only partial/truncated snippet words (e.g. "Topics", "Content Amplification Additional guidance notes", "Subject content", "What students need to learn") and nothing else, return:
+  {
+    "title": "EMPTY_IMAGE",
+    "subtopics": []
+  }
+"""
+            # Reuse one module-level client; api_key is ignored once it exists.
+            global _GEMINI_CLIENT
+            if _GEMINI_CLIENT is None:
+                _GEMINI_CLIENT = genai.Client(api_key=api_key)
+            client = _GEMINI_CLIENT
+            # temperature=0.0 is set on every request.
+            resp = client.models.generate_content(
+                model="gemini-2.0-flash",
+                contents=[
+                    {
+                        "parts": [
+                            {"text": prompt},
+                            {
+                                "inline_data": {
+                                    "mime_type": "image/jpeg",
+                                    "data": base64.b64encode(image_data).decode("utf-8")
+                                }
+                            }
+                        ]
+                    }
+                ],
+                config=types.GenerateContentConfig(temperature=0.0)
+            )
+            # An empty reply is returned immediately; it is not retried.
+            if not resp or not resp.text:
+                logger.warning("Gemini returned an empty response for subtopic extraction.")
+                return {"title": "", "subtopics": []}
+            raw = resp.text.strip()
+            # Remove any markdown fences if present
+            raw = raw.replace("```json", "").replace("```", "").strip()
+            # Invalid JSON (or a non-dict / non-string title) raises here and is
+            # handled by the except branch below, i.e. it counts as a failed attempt.
+            data = json.loads(raw)
+            title = data.get("title", "")
+            subtopics = data.get("subtopics", [])
+            # Normalise the sentinel regardless of the case the model used.
+            if title.upper() == "EMPTY_IMAGE":
+                return {"title": "EMPTY_IMAGE", "subtopics": []}
+            if not isinstance(subtopics, list):
+                subtopics = []
+            return {"title": title, "subtopics": subtopics}
+        except Exception as e:
+            logger.error(f"Gemini subtopic identification error on attempt {attempt}: {e}")
+            if attempt < max_retries:
+                time.sleep(0.5)
+            else:
+                return {"title": "", "subtopics": []}
+    # Reached only when the loop body never runs (max_retries < 0).
+    return {"title": "", "subtopics": []}
+class S3ImageWriter(DataWriter):
+    def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
+        """Store the uploader, the S3 key prefix (forced to end with "/") and the Gemini key."""
+        if not base_path.endswith("/"):
+            base_path = base_path + "/"
+        self.s3_writer = s3_writer
+        self.base_path = base_path
+        self.gemini_api_key = gemini_api_key
+        # Number of images written so far; used to build img_<n>.jpg names.
+        self._img_count = 0
+        # Per-image records keyed by the path passed to write().
+        self.descriptions = {}
+        self.extracted_tables = {}
+        self.extracted_subtopics = {}
+    def write(self, path: str, data: bytes) -> None:
+        """Upload ``data`` as the next ``img_<n>.jpg`` under the base path and record it under ``path``."""
+        self._img_count += 1
+        s3_key = f"{self.base_path}img_{self._img_count}.jpg"
+        self.s3_writer.write(s3_key, data)
+        # Every image starts as NO_TABLE with an empty alt text; both fields
+        # are overwritten later by post_process_async.
+        record = {
+            "data": data,
+            "s3_path": s3_key,
+            "table_classification": "NO_TABLE",
+            "final_alt": "",
+        }
+        self.descriptions[path] = record
+    async def post_process_async(self, key: str, md_content: str) -> str:
+        """
+        Classify every recorded image, rewrite its markdown reference, and return only image lines.
+        Images classified EMPTY_IMAGE are removed from the markdown, deleted from S3
+        (best effort) and dropped from ``self.descriptions``. All others get an alt
+        text describing their classification and point at their S3 key. Table images
+        are then handed to ``_process_table_images_in_markdown``.
+        Returns:
+            The stripped lines of the resulting markdown that start with an image
+            reference, joined with newlines.
+        """
+        logger.info("Classifying images to detect tables.")
+        jobs = [
+            asyncio.create_task(classify_image_async(entry["data"], self.gemini_api_key))
+            for entry in self.descriptions.values()
+        ]
+        outcomes = await asyncio.gather(*jobs, return_exceptions=True)
+        for path, outcome in zip(list(self.descriptions.keys()), outcomes):
+            if isinstance(outcome, Exception):
+                logger.error(f"Table classification error for {path}: {outcome}")
+                outcome = "NO_TABLE"
+            self.descriptions[path]['table_classification'] = outcome
+        # Alt text for table classes; every other non-empty class uses the fallback.
+        alt_by_class = {
+            "TWO_COLUMN": "HAS TO BE PROCESSED - two column table",
+            "THREE_COLUMN": "HAS TO BE PROCESSED - three column table",
+        }
+        # Iterate over a snapshot because empty images are deleted from the dict.
+        for path, entry in list(self.descriptions.items()):
+            label = entry['table_classification']
+            placeholder = f"![]({key}{path})"
+            if label == "EMPTY_IMAGE":
+                md_content = md_content.replace(placeholder, "")
+                try:
+                    self.s3_writer.delete(entry['s3_path'])
+                except Exception as e:
+                    logger.error(f"Error deleting S3 object {entry['s3_path']}: {e}")
+                del self.descriptions[path]
+                continue
+            entry['final_alt'] = alt_by_class.get(label, "NO_TABLE image")
+            md_content = md_content.replace(placeholder, f"![{entry['final_alt']}]({entry['s3_path']})")
+        md_content = await self._process_table_images_in_markdown(key, md_content)
+        # Keep only the lines that begin with a markdown image reference.
+        image_line = re.compile(r"^\!\[.*\]\(.*\)")
+        kept = []
+        for line in md_content.split("\n"):
+            stripped = line.strip()
+            if image_line.match(stripped):
+                kept.append(stripped)
+        return "\n".join(kept)
+    async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
+        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
+        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
+        if not matches:
+            return md_content
+        for (col_type, s3_key) in matches:
+            logger.info(f"Processing table image: {s3_key}, columns={col_type}")
+            img_data = None
+            for desc in self.descriptions.values():
+                if desc.get("s3_path") == s3_key:
+                    img_data = desc.get("data")
+                    break
+            if img_data is None:
+                logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
+                continue
+            # Write temporary file for processing.
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
+                temp_file.write(img_data)
+                temp_path = temp_file.name
+            try:
+                if col_type.lower() == 'two':
+                    extractor = TableExtractor(
+                        skip_header=True,
+                        merge_two_col_rows=True,
+                        enable_subtopic_merge=True,
+                        subtopic_threshold=0.2
+                    )
+                else:
+                    extractor = TableExtractor(
+                        skip_header=True,
+                        merge_two_col_rows=False,
+                        enable_subtopic_merge=False,
+                        subtopic_threshold=0.2
+                    )
+                row_boxes = extractor.process_image(temp_path)
+                out_folder = temp_path + "_rows"
+                os.makedirs(out_folder, exist_ok=True)
+                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
+                #Group cells by row using file name pattern
+                recognized_main_topic = ""
+                main_topic_image_key = None
+                recognized_subtopics = []
+                header_found = False
+                header_row_index = None
+                # Loop through each row of extracted cells
+                for i, row in enumerate(row_boxes):
+                    row_dir = os.path.join(out_folder, f"row_{i}")
+                    valid_info = None
+                    valid_cell_key = None
+                    for j in range(len(row)):
+                        cell_path = os.path.join(row_dir, f"col_{j}.png")
+                        if not os.path.isfile(cell_path):
+                            alternative_path = os.path.join(row_dir, f"col_{j}.jpg")
+                            if os.path.isfile(alternative_path):
+                                cell_path = alternative_path
+                            else:
+                                logger.warning(f"Cell image not found: {cell_path}")
+                                continue
+                        with open(cell_path, "rb") as cf:
+                            cell_image_data = cf.read()
+                        cell_key = f"{self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.png"
+                        self.s3_writer.write(cell_key, cell_image_data)
+                        info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)
+                        if info.get("title", "").upper() == "EMPTY_IMAGE":
+                            try:
+                                self.s3_writer.delete(cell_key)
+                                logger.info(f"Deleted empty cell image from S3: {cell_key}")
+                            except Exception as e:
+                                logger.error(f"Error deleting empty cell image {cell_key}: {e}")
+                            continue
+                        valid_info = info
+                        valid_cell_key = cell_key
+                        break  # Use only the first valid cell in this row
+                    if valid_info is None:
+                        continue
+                    # First valid row becomes header row.
+                    if not header_found:
+                        header_found = True
+                        header_row_index = i
+                        recognized_main_topic = valid_info.get("title", "")
+                        main_topic_image_key = valid_cell_key
+                    # The row immediately following the header is used for subtopic children.
+                    elif i == header_row_index + 1:
+                        for st in valid_info.get("subtopics", []):
+                            recognized_subtopics.append({
+                                "title": st,
+                                "contents": [{"type": "image", "key": valid_cell_key}],
+                                "children": []
+                            })
+                    else:
+                        # Ignore further rows
+                        continue
+                final_json = {
+                    "title": recognized_main_topic,
+                    "contents": [],
+                    "children": recognized_subtopics
+                }
+                if main_topic_image_key:
+                    final_json["contents"].append({"type": "image", "key": main_topic_image_key})
+                # Save the final JSON.
+                self.extracted_subtopics[s3_key] = final_json
+                # Create a snippet to replace the markdown line.
+                snippet = ["**Extracted table cells:**"]
+                if main_topic_image_key:
+                    snippet.append(f"![Header]({main_topic_image_key})")
+                for child in recognized_subtopics:
+                    for content in child.get("contents", []):
+                        snippet.append(f"![Child]({content.get('key')})")
+                new_snip = "\n".join(snippet)
+                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
+                md_content = md_content.replace(old_line, new_snip)
+            except Exception as e:
+                logger.error(f"Error processing table image {s3_key}: {e}")
+            finally:
+                os.remove(temp_path)
+        return md_content
+    def post_process(self, key: str, md_content: str) -> str:
+        return asyncio.run(self.post_process_async(key, md_content))
+class GeminiTopicExtractor:
+    def __init__(self, api_key: str = None, num_pages: int = 14):
+        self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
+        self.num_pages = num_pages
+    def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
+        first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
+        if not first_pages_text.strip():
+            logger.error("No text from first pages => cannot extract subtopics.")
+            return {}
+        prompt = f"""
+You have the first pages of a PDF specification, including a table of contents.
+Instructions:
+1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
+2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
+3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
+4. Output only valid JSON of the form:
+    {{
+    "Subtopic A": [start_page, end_page],
+    "Subtopic B": [start_page, end_page]
+    }}
+5. If you can't find any subtopics, return an empty JSON.
+Important notes:
+- The correct "end_page" must be the page number of the next topic or subtopic minus 1.
+- The final output must be valid JSON only, with no extra text or code blocks.
+Examples:
+1. Given this table of contents:
+1 Introduction – 2
+    Why choose Edexcel A Level Mathematics? - 2
+    Supporting you in planning and implementing this qualification - 3
+    Qualification at a glance - 5
+2 Subject content and assessment information – 7
+    Paper 1 and Paper 2: Pure Mathematics - 11
+    Paper 3: Statistics and Mechanics - 30
+    Assessment Objectives - 40
+3 Administration and general information – 42
+    Entries - 42
+    Access arrangements, reasonable adjustments, special consideration and malpractice - 42
+    Student recruitment and progression - 45
+Appendix 1: Formulae – 49
+Appendix 2: Notation – 53
+Appendix 3: Use of calculators – 59
+Appendix 4: Assessment Objectives – 60
+Appendix 5: The context for the development of this qualification – 62
+Appendix 6: Transferable skills – 64
+Appendix 7: Level 3 Extended Project qualification – 65
+Appendix 8: Codes – 67
+The correct output should be:
+{{
+    "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
+    "Paper 3: Statistics and Mechanics": [30, 42]
+}}
+2. Given this table of contents:
+Qualification at a glance – 1
+    Assessment Objectives and weightings - 4
+Knowledge, skills and understanding – 5
+    Theme 1: Introduction to markets and market failure - 5
+    Theme 2: The UK economy – performance and policies - 11
+    Theme 3: Business behaviour and the labour market - 21
+    Theme 4: A global perspective - 29
+Assessment – 39
+    Assessment summary - 39
+    Assessment objectives - 41
+    Assessment overview - 42
+    Breakdown of assessment objectives - 42
+        Synoptic assessment - 43
+        Discount code and performance tables - 43
+        Access arrangements, reasonable adjustments and special consideration - 44
+        Malpractice - 45
+        Equality Act 2010 and Pearson equality policy - 45
+        Synoptic assessment - 46
+        Awarding and reporting - 47
+Other information – 49
+    Student recruitment -49
+    Prior learning and other requirements -49
+    Progression - 49
+Appendix 1: Transferable skills – 53
+Appendix 2: Level 3 Extended Project qualification – 55
+Appendix 3: Quantitative skills – 59
+Appendix 4: Codes – 61
+Appendix 5: Index – 63
+The correct output should be:
+{{
+    "Theme 1: Introduction to markets and market failure": [5, 10],
+    "Theme 2: The UK economy – performance and policies": [11, 20],
+    "Theme 3: Business behaviour and the labour market": [21, 28],
+    "Theme 4: A global perspective": [29, 38]
+}}
+3. You might also see sections like:
+2.1 AS Unit 1 11
+2.2 AS Unit 2 18
+2.3 A2 Unit 3 24
+2.4 A2 Unit 4 31
+In that scenario, your output might look like:
+{{
+    "2.1 AS Unit 1": [11, 17],
+    "2.2 AS Unit 2": [18, 23],
+    "2.3 A2 Unit 3": [24, 30],
+    "2.4 A2 Unit 4": [31, 35]
+}}
+or
+2.1 AS units 6
+2.2 AS units 23
+In that scenario, your output might look like:
+{{
+    "2.1 AS Unit 1": [6, 2],
+    "2.2 AS Unit 2": [23, 43]
+}}
+4. Another example might list subtopics:
+3.1 Overarching themes 11
+3.2 A: Proof 12
+3.3 B: Algebra and functions 13
+3.4 C: Coordinate geometry in the ( x , y ) plane 14
+3.5 D: Sequences and series 15
+3.6 E: Trigonometry 16
+3.7 F: Exponentials and logarithms 17
+3.8 G: Differentiation 18
+3.9 H: Integration 19
+3.10 I: Numerical methods 20
+3.11 J: Vectors 20
+3.12 K: Statistical sampling 21
+3.13 L: Data presentation and interpretation 21
+3.14 M: Probability 22
+3.15 N: Statistical distributions 23
+3.16 O: Statistical hypothesis testing 23
+3.17 P: Quantities and units in mechanics 24
+3.18 Q: Kinematics 24
+3.19 R: Forces and Newton’s laws 24
+3.20 S: Moments 25
+3.21 Use of data in statistics 26
+Here the correct output might look like:
+{{
+    "A: Proof": [12, 12],
+    "B: Algebra and functions": [13, 13],
+    ...
+}}
+Now, extract topics from this text:
+{first_pages_text}
+"""
+        global _GEMINI_CLIENT
+        if _GEMINI_CLIENT is None:
+            _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
+        client = _GEMINI_CLIENT
+        try:
+            response = client.models.generate_content(
+                model="gemini-2.0-flash",
+                contents=[prompt],
+                config=types.GenerateContentConfig(temperature=0.0)
+            )
+            if not response or not response.text:
+                logger.warning("No text from LLM => returning empty subtopics.")
+                return {}
+            raw_json = response.text.strip()
+            cleaned = raw_json.replace("```json", "").replace("```", "")
+            try:
+                data = json.loads(cleaned)
+            except Exception as json_err:
+                logger.error(f"JSON parsing error: {json_err}")
+                return {}
+            final_dict = {}
+            found_sub_dict = None
+            for k, v in data.items():
+                if isinstance(v, dict):
+                    found_sub_dict = v
+                    break
+            if found_sub_dict is not None:
+                for subk, rng in found_sub_dict.items():
+                    if isinstance(rng, list) and len(rng) == 2:
+                        final_dict[subk] = rng
+            else:
+                for subk, rng in data.items():
+                    if isinstance(rng, list) and len(rng) == 2:
+                        final_dict[subk] = rng
+            return final_dict
+        except Exception as e:
+            logger.error(f"Gemini subtopic extraction error: {e}")
+            return {}
+    def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
+        text_parts = []
+        try:
+            if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
+                response = requests.get(pdf_path)
+                if response.status_code != 200:
+                    logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
+                    return ""
+                pdf_bytes = response.content
+            else:
+                with open(pdf_path, "rb") as f:
+                    pdf_bytes = f.read()
+            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+            pages_to_read = min(num_pages, doc.page_count)
+            for i in range(pages_to_read):
+                raw_text = doc[i].get_text("raw")
+                text_parts.append(raw_text)
+            doc.close()
+        except Exception as e:
+            logger.error(f"Could not open PDF: {e}")
+        return "\n".join(text_parts)
+class MineruNoTextProcessor:
+    def __init__(self, output_folder: str, gemini_api_key: str):
+        self.output_folder = output_folder
+        os.makedirs(self.output_folder, exist_ok=True)
+        self.layout_model = "doclayout_yolo"
+        self.formula_enable = True
+        self.table_enable = False
+        self.language = "en"
+        self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
+        self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
+        self.use_s3 = True
+        self.s3_writer = s3Writer(
+            ak=os.getenv("S3_ACCESS_KEY"),
+            sk=os.getenv("S3_SECRET_KEY"),
+            bucket="quextro-resources",
+            endpoint_url=os.getenv("S3_ENDPOINT")
+        )
+    def cleanup_gpu(self):
+        try:
+            gc.collect()
+            torch.cuda.empty_cache()
+            logger.info("GPU memory cleaned up.")
+        except Exception as e:
+            logger.error(f"Error during GPU cleanup: {e}")
+    def process(self, pdf_path: str) -> Dict[str, Any]:
+        logger.info(f"Processing PDF: {pdf_path}")
+        try:
+            subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
+            logger.info(f"Gemini returned subtopics: {subtopics}")
+            if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
+                response = requests.get(pdf_path)
+                if response.status_code != 200:
+                    logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
+                    raise Exception(f"Failed to download PDF: {pdf_path}")
+                pdf_bytes = response.content
+                logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
+            else:
+                with open(pdf_path, "rb") as f:
+                    pdf_bytes = f.read()
+                logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
+            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+            total_pages = doc.page_count
+            doc.close()
+            # Decide which pages to process
+            final_pages = set()
+            if not subtopics:
+                # fallback
+                final_pages = set(range(total_pages))
+            else:
+                offset_candidates = []
+                for subname, rng in subtopics.items():
+                    start_p, _ = rng
+                    occs = find_all_occurrences(pdf_bytes, subname)
+                    for p in occs:
+                        candidate = p - (start_p - 1)
+                        if candidate > 0:
+                            offset_candidates.append(candidate)
+                if offset_candidates:
+                    try:
+                        from statistics import mode
+                        global_offset = mode(offset_candidates)
+                    except:
+                        from statistics import median
+                        global_offset = int(median(offset_candidates))
+                else:
+                    global_offset = 0
+                logger.info(f"Computed global offset: {global_offset}")
+                for subname, rng in subtopics.items():
+                    if not (isinstance(rng, list) and len(rng) == 2):
+                        continue
+                    start_p, end_p = rng
+                    if start_p > end_p:
+                        continue
+                    s0 = (start_p - 1) + global_offset
+                    e0 = (end_p - 1) + global_offset
+                    for pp in range(s0, e0 + 1):
+                        final_pages.add(pp)
+            if not final_pages:
+                final_pages = set(range(total_pages))
+            logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
+            subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
+            # 4) Analyze and produce markdown
+            dataset = PymuDocDataset(subset_pdf_bytes)
+            inference = doc_analyze(
+                dataset,
+                ocr=True,
+                lang=self.language,
+                layout_model=self.layout_model,
+                formula_enable=self.formula_enable,
+                table_enable=self.table_enable
+            )
+            # S3
+            writer = S3ImageWriter(self.s3_writer, "/topic-extraction", self.gemini_api_key)
+            md_prefix = "/topic-extraction/"
+            pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
+            md_content = pipe_result.get_markdown(md_prefix)
+            final_markdown = writer.post_process(md_prefix, md_content)
+            subtopic_list = list(writer.extracted_subtopics.values())
+            subtopic_list = merge_topics(subtopic_list)
+            out_path = os.path.join(self.output_folder, "_subtopics.json")
+            with open(out_path, "w", encoding="utf-8") as f:
+                json.dump(subtopic_list, f, indent=2)
+            logger.info(f"Final subtopics JSON saved locally at {out_path}")
+            return {
+                "final_markdown": final_markdown,
+                "subtopics_extracted": subtopic_list
+            }
+        finally:
+            self.cleanup_gpu()
+if __name__ == "__main__":
+    input_pdf = "/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf"
+    output_dir = "/home/user/app/pearson_json"
+    gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
+    try:
+        processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
+        result = processor.process(input_pdf)
+        logger.info("Processing completed successfully.")
+    except Exception as e:
+        logger.error(f"Processing failed: {e}")

topic_extraction.log CHANGED Viewed

@@ -7483,3 +7483,237 @@ and series'. Using page 7.
 2025-03-04 17:29:32,884 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
 2025-03-04 17:29:33,308 [INFO] __main__ - Classifying images to detect tables.
 2025-03-04 17:59:52,883 [INFO] __main__ - GPU memory cleaned up.

 2025-03-04 17:29:32,884 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
 2025-03-04 17:29:33,308 [INFO] __main__ - Classifying images to detect tables.
 2025-03-04 17:59:52,883 [INFO] __main__ - GPU memory cleaned up.
+2025-03-04 18:24:55,659 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf
+2025-03-04 18:24:56,486 [INFO] __main__ - Gemini returned subtopics: {'2.1AS units': [7, 22], '2.2A2 units': [23, 43]}
+2025-03-04 18:24:56,487 [INFO] __main__ - Loaded 3543551 bytes from local file '/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf'
+2025-03-04 18:24:56,724 [INFO] __main__ - Computed global offset: 0
+2025-03-04 18:24:56,725 [INFO] __main__ - Processing pages (0-based): [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
+2025-03-04 18:26:37,627 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
+2025-03-04 18:26:38,287 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
+2025-03-04 18:26:38,720 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
+2025-03-04 18:26:39,215 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
+2025-03-04 18:26:39,531 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
+2025-03-04 18:26:39,917 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
+2025-03-04 18:26:40,490 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
+2025-03-04 18:26:40,968 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
+2025-03-04 18:26:41,372 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
+2025-03-04 18:26:41,675 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
+2025-03-04 18:26:42,251 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
+2025-03-04 18:26:42,757 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
+2025-03-04 18:26:43,326 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
+2025-03-04 18:26:43,626 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
+2025-03-04 18:26:44,254 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
+2025-03-04 18:26:44,797 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
+2025-03-04 18:26:45,300 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
+2025-03-04 18:26:45,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
+2025-03-04 18:26:46,237 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
+2025-03-04 18:26:46,642 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
+2025-03-04 18:26:47,162 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
+2025-03-04 18:26:47,668 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
+2025-03-04 18:26:48,043 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
+2025-03-04 18:26:48,639 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
+2025-03-04 18:26:49,154 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
+2025-03-04 18:26:49,534 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
+2025-03-04 18:26:50,096 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
+2025-03-04 18:26:50,670 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
+2025-03-04 18:26:51,044 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_29.jpg
+2025-03-04 18:26:51,475 [INFO] __main__ - Classifying images to detect tables.
+2025-03-04 18:26:56,074 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
+2025-03-04 18:26:59,389 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r0_c0.png
+2025-03-04 18:27:00,348 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_1.jpg_r0_c0.png
+2025-03-04 18:27:00,601 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r1_c0.png
+2025-03-04 18:27:10,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r2_c0.png
+2025-03-04 18:27:11,820 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r3_c0.png
+2025-03-04 18:27:12,855 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
+2025-03-04 18:27:13,889 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_1.jpg_r4_c0.png
+2025-03-04 18:27:13,890 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
+2025-03-04 18:27:17,341 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r0_c0.png
+2025-03-04 18:27:18,536 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r1_c0.png
+2025-03-04 18:27:19,842 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r2_c0.png
+2025-03-04 18:27:20,887 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_2.jpg_r3_c0.png
+2025-03-04 18:27:22,626 [WARNING] __main__ - Cell image not found: /tmp/tmpns_p2pw7.jpg_rows/row_4/col_0.png
+2025-03-04 18:27:22,626 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
+2025-03-04 18:27:24,756 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r0_c0.png
+2025-03-04 18:27:25,630 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_3.jpg_r0_c0.png
+2025-03-04 18:27:25,976 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_3.jpg_r1_c0.png
+2025-03-04 18:27:26,909 [WARNING] __main__ - Cell image not found: /tmp/tmpmkqp5iik.jpg_rows/row_2/col_0.png
+2025-03-04 18:27:26,910 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
+2025-03-04 18:27:29,569 [WARNING] __main__ - Cell image not found: /tmp/tmpnakrpg49.jpg_rows/row_0/col_0.png
+2025-03-04 18:27:29,569 [WARNING] __main__ - Cell image not found: /tmp/tmpnakrpg49.jpg_rows/row_0/col_1.png
+2025-03-04 18:27:29,835 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r1_c0.png
+2025-03-04 18:27:30,823 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_4.jpg_r1_c0.png
+2025-03-04 18:27:30,823 [WARNING] __main__ - Cell image not found: /tmp/tmpnakrpg49.jpg_rows/row_1/col_1.png
+2025-03-04 18:27:31,085 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r2_c0.png
+2025-03-04 18:27:33,674 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r3_c0.png
+2025-03-04 18:27:34,672 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
+2025-03-04 18:27:35,592 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_4.jpg_r4_c0.png
+2025-03-04 18:27:35,593 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
+2025-03-04 18:27:36,679 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r0_c0.png
+2025-03-04 18:27:37,655 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_5.jpg_r0_c0.png
+2025-03-04 18:27:37,997 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_5.jpg_r1_c0.png
+2025-03-04 18:27:38,787 [WARNING] __main__ - Cell image not found: /tmp/tmp59baffv6.jpg_rows/row_2/col_0.png
+2025-03-04 18:27:38,787 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
+2025-03-04 18:27:40,808 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r0_c0.png
+2025-03-04 18:27:41,806 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_6.jpg_r0_c0.png
+2025-03-04 18:27:42,094 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r1_c0.png
+2025-03-04 18:27:43,132 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
+2025-03-04 18:27:44,097 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_6.jpg_r2_c0.png
+2025-03-04 18:27:44,097 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
+2025-03-04 18:27:47,411 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r0_c0.png
+2025-03-04 18:27:48,353 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_7.jpg_r0_c0.png
+2025-03-04 18:27:48,705 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r1_c0.png
+2025-03-04 18:27:49,963 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r2_c0.png
+2025-03-04 18:27:50,936 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
+2025-03-04 18:27:52,024 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_7.jpg_r3_c0.png
+2025-03-04 18:27:52,025 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
+2025-03-04 18:27:54,377 [WARNING] __main__ - Cell image not found: /tmp/tmpsppe7tt4.jpg_rows/row_0/col_0.png
+2025-03-04 18:27:54,378 [WARNING] __main__ - Cell image not found: /tmp/tmpsppe7tt4.jpg_rows/row_0/col_1.png
+2025-03-04 18:27:54,639 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r1_c0.png
+2025-03-04 18:27:55,574 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_8.jpg_r1_c0.png
+2025-03-04 18:27:55,856 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r2_c0.png
+2025-03-04 18:27:56,935 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r3_c0.png
+2025-03-04 18:27:57,936 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
+2025-03-04 18:27:58,830 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_8.jpg_r4_c0.png
+2025-03-04 18:27:58,830 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
+2025-03-04 18:28:00,927 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r0_c0.png
+2025-03-04 18:28:01,839 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_9.jpg_r0_c0.png
+2025-03-04 18:28:02,124 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r1_c0.png
+2025-03-04 18:28:03,147 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r2_c0.png
+2025-03-04 18:28:04,318 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r3_c0.png
+2025-03-04 18:28:05,234 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
+2025-03-04 18:28:06,333 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_9.jpg_r4_c0.png
+2025-03-04 18:28:06,333 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
+2025-03-04 18:28:07,300 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r0_c0.png
+2025-03-04 18:28:08,246 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_10.jpg_r0_c0.png
+2025-03-04 18:28:08,508 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r1_c0.png
+2025-03-04 18:28:09,569 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
+2025-03-04 18:28:10,602 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_10.jpg_r2_c0.png
+2025-03-04 18:28:10,603 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=three
+2025-03-04 18:28:13,214 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r0_c0.png
+2025-03-04 18:28:14,131 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_11.jpg_r0_c0.png
+2025-03-04 18:28:14,477 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r1_c0.png
+2025-03-04 18:28:15,765 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
+2025-03-04 18:28:16,868 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_11.jpg_r2_c0.png
+2025-03-04 18:28:16,869 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
+2025-03-04 18:28:19,488 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r0_c0.png
+2025-03-04 18:28:20,477 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_12.jpg_r0_c0.png
+2025-03-04 18:28:20,850 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r1_c0.png
+2025-03-04 18:28:21,976 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
+2025-03-04 18:28:22,922 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_12.jpg_r2_c0.png
+2025-03-04 18:28:22,923 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
+2025-03-04 18:28:26,026 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
+2025-03-04 18:28:26,939 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r0_c0.png
+2025-03-04 18:28:27,213 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r0_c1.png
+2025-03-04 18:28:28,270 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r0_c1.png
+2025-03-04 18:28:28,611 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r1_c0.png
+2025-03-04 18:28:29,683 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
+2025-03-04 18:28:30,673 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r2_c0.png
+2025-03-04 18:28:30,933 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r3_c0.png
+2025-03-04 18:28:31,996 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
+2025-03-04 18:28:32,949 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_13.jpg_r4_c0.png
+2025-03-04 18:28:32,950 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
+2025-03-04 18:28:34,332 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r0_c0.png
+2025-03-04 18:28:35,272 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_14.jpg_r0_c0.png
+2025-03-04 18:28:35,541 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r1_c0.png
+2025-03-04 18:28:36,537 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
+2025-03-04 18:28:37,794 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_14.jpg_r2_c0.png
+2025-03-04 18:28:37,794 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
+2025-03-04 18:28:43,119 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r0_c0.png
+2025-03-04 18:28:44,084 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_15.jpg_r0_c0.png
+2025-03-04 18:28:44,353 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r1_c0.png
+2025-03-04 18:28:45,692 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r2_c0.png
+2025-03-04 18:28:46,679 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r3_c0.png
+2025-03-04 18:28:47,545 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
+2025-03-04 18:28:48,749 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_15.jpg_r4_c0.png
+2025-03-04 18:28:48,749 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
+2025-03-04 18:28:51,810 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r0_c0.png
+2025-03-04 18:28:52,802 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_16.jpg_r0_c0.png
+2025-03-04 18:28:53,064 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r1_c0.png
+2025-03-04 18:28:54,144 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r2_c0.png
+2025-03-04 18:28:55,133 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r3_c0.png
+2025-03-04 18:28:57,845 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r4_c0.png
+2025-03-04 18:28:58,855 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
+2025-03-04 18:28:59,722 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_16.jpg_r5_c0.png
+2025-03-04 18:28:59,722 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
+2025-03-04 18:29:02,875 [WARNING] __main__ - Cell image not found: /tmp/tmp0emfx_zt.jpg_rows/row_0/col_0.png
+2025-03-04 18:29:03,148 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r1_c0.png
+2025-03-04 18:29:04,098 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_17.jpg_r1_c0.png
+2025-03-04 18:29:04,361 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r2_c0.png
+2025-03-04 18:29:05,885 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r3_c0.png
+2025-03-04 18:29:06,881 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
+2025-03-04 18:29:07,738 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_17.jpg_r4_c0.png
+2025-03-04 18:29:07,739 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
+2025-03-04 18:29:09,552 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r0_c0.png
+2025-03-04 18:29:10,757 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r1_c0.png
+2025-03-04 18:29:11,784 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r2_c0.png
+2025-03-04 18:29:12,800 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
+2025-03-04 18:29:13,609 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_18.jpg_r3_c0.png
+2025-03-04 18:29:13,610 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
+2025-03-04 18:29:16,305 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r0_c0.png
+2025-03-04 18:29:17,210 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_19.jpg_r0_c0.png
+2025-03-04 18:29:17,472 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r1_c0.png
+2025-03-04 18:29:18,587 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r2_c0.png
+2025-03-04 18:29:19,610 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
+2025-03-04 18:29:20,792 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_19.jpg_r3_c0.png
+2025-03-04 18:29:20,792 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
+2025-03-04 18:29:22,579 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r0_c0.png
+2025-03-04 18:29:23,599 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_20.jpg_r0_c0.png
+2025-03-04 18:29:23,861 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r1_c0.png
+2025-03-04 18:29:24,796 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_20.jpg_r2_c0.png
+2025-03-04 18:29:25,612 [WARNING] __main__ - Cell image not found: /tmp/tmpmxenc_0d.jpg_rows/row_3/col_0.png
+2025-03-04 18:29:25,613 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
+2025-03-04 18:29:28,446 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r0_c0.png
+2025-03-04 18:29:29,404 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_21.jpg_r0_c0.png
+2025-03-04 18:29:29,814 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r1_c0.png
+2025-03-04 18:29:30,864 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
+2025-03-04 18:29:31,899 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_21.jpg_r2_c0.png
+2025-03-04 18:29:31,899 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
+2025-03-04 18:29:34,452 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r0_c0.png
+2025-03-04 18:29:35,395 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_22.jpg_r0_c0.png
+2025-03-04 18:29:35,740 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r1_c0.png
+2025-03-04 18:29:36,880 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
+2025-03-04 18:29:37,830 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_22.jpg_r2_c0.png
+2025-03-04 18:29:37,830 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
+2025-03-04 18:29:39,773 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r0_c0.png
+2025-03-04 18:29:40,725 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_23.jpg_r0_c0.png
+2025-03-04 18:29:40,986 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_23.jpg_r1_c0.png
+2025-03-04 18:29:41,800 [WARNING] __main__ - Cell image not found: /tmp/tmp1_2b4e5z.jpg_rows/row_2/col_0.png
+2025-03-04 18:29:41,800 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
+2025-03-04 18:29:45,437 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r0_c0.png
+2025-03-04 18:29:46,443 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_24.jpg_r0_c0.png
+2025-03-04 18:29:46,788 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_24.jpg_r1_c0.png
+2025-03-04 18:29:47,654 [WARNING] __main__ - Cell image not found: /tmp/tmpyd5fc1x8.jpg_rows/row_2/col_0.png
+2025-03-04 18:29:47,654 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=three
+2025-03-04 18:29:49,997 [WARNING] __main__ - Cell image not found: /tmp/tmpje6qj8ty.jpg_rows/row_0/col_0.png
+2025-03-04 18:29:50,258 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r1_c0.png
+2025-03-04 18:29:51,237 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_25.jpg_r1_c0.png
+2025-03-04 18:29:51,649 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r2_c0.png
+2025-03-04 18:29:52,817 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
+2025-03-04 18:29:53,849 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_25.jpg_r3_c0.png
+2025-03-04 18:29:53,849 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
+2025-03-04 18:29:55,903 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r0_c0.png
+2025-03-04 18:29:56,784 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_26.jpg_r0_c0.png
+2025-03-04 18:29:57,121 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_26.jpg_r1_c0.png
+2025-03-04 18:29:58,092 [WARNING] __main__ - Cell image not found: /tmp/tmple_xivqw.jpg_rows/row_2/col_0.png
+2025-03-04 18:29:58,092 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
+2025-03-04 18:30:01,339 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r0_c0.png
+2025-03-04 18:30:02,324 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_27.jpg_r0_c0.png
+2025-03-04 18:30:02,680 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r1_c0.png
+2025-03-04 18:30:03,795 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r2_c0.png
+2025-03-04 18:30:04,805 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
+2025-03-04 18:30:05,808 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_27.jpg_r3_c0.png
+2025-03-04 18:30:05,809 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=three
+2025-03-04 18:30:08,340 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r0_c0.png
+2025-03-04 18:30:09,205 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_28.jpg_r0_c0.png
+2025-03-04 18:30:09,541 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r1_c0.png
+2025-03-04 18:30:11,786 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
+2025-03-04 18:30:12,603 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_28.jpg_r2_c0.png
+2025-03-04 18:30:12,603 [INFO] __main__ - Processing table image: /topic-extraction/img_29.jpg, columns=three
+2025-03-04 18:30:14,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r0_c0.png
+2025-03-04 18:30:15,408 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_29.jpg_r0_c0.png
+2025-03-04 18:30:15,669 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r1_c0.png
+2025-03-04 18:30:18,844 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
+2025-03-04 18:30:20,616 [INFO] __main__ - Deleted empty cell image from S3: /topic-extraction/cells/img_29.jpg_r2_c0.png
+2025-03-04 18:30:20,620 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/pearson_json/_subtopics.json
+2025-03-04 18:30:20,956 [INFO] __main__ - GPU memory cleaned up.
+2025-03-04 18:30:20,961 [INFO] __main__ - Processing completed successfully.