MinerU

Paused

App Files Files Community

SkyNait committed on Feb 26

Commit

a6a7c69

1 Parent(s): 8966134

test

Browse files

Files changed (4) hide show

__pycache__/mineru_test_local.cpython-310.pyc +0 -0
__pycache__/table_row_extraction.cpython-310.pyc +0 -0
inference_svm_model.py +24 -205
selective_pdf_extractor.log +0 -0

__pycache__/mineru_test_local.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/mineru_test_local.cpython-310.pyc and b/__pycache__/mineru_test_local.cpython-310.pyc differ

__pycache__/table_row_extraction.cpython-310.pyc ADDED Viewed

Binary file (10.9 kB). View file

inference_svm_model.py CHANGED Viewed

@@ -1,212 +1,31 @@
 #!/usr/bin/env python3
 import os
-import re
-import json
-import logging
-import fitz  # PyMuPDF
-from typing import Optional, Tuple, Dict, List
-from contents_extractor_v2 import ContentsExtractor
-from mineru_test_local import LocalPDFProcessor
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
-    handlers=[
-        logging.StreamHandler(),
-        logging.FileHandler('selective_pdf_extractor.log')
-    ]
-)
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-class SelectivePDFProcessor:
-    """
-    Processes PDF files by extracting only subject content sections.
-    First identifies if it's a specification document, then finds the Contents page,
-    extracts subject content page ranges, and processes only those pages.
-    """
-    def __init__(self, output_folder: str, api_key: str):
-        self.output_folder = output_folder
-        os.makedirs(self.output_folder, exist_ok=True)
-        self.api_key = api_key
-        self.contents_extractor = ContentsExtractor(api_key=api_key)
-        self.pdf_processor = LocalPDFProcessor(output_folder=output_folder)
-    def check_for_specification(self, pdf_path: str) -> bool:
-        """
-        Checks if the PDF is a specification document by looking for the word 'specification'
-        on the first page.
-        """
-        try:
-            doc = fitz.open(pdf_path)
-            first_page_text = doc[0].get_text().lower()
-            doc.close()
-            return 'specification' in first_page_text
-        except Exception as e:
-            logger.error(f"Error checking for specification: {e}")
-            return False
-    def find_contents_page(self, pdf_path: str) -> Optional[int]:
-        """
-        Finds the page number of the Contents section.
-        """
-        try:
-            doc = fitz.open(pdf_path)
-            # Check first 20 pages for "Contents"
-            # (assuming Contents is within the first 20 pages)
-            max_pages = min(20, doc.page_count)
-            for page_num in range(max_pages):
-                page_text = doc[page_num].get_text()
-                # Look for "Contents" as a standalone heading
-                if re.search(r'^\s*Contents\s*$', page_text, re.MULTILINE):
-                    logger.info(f"Found Contents page at page {page_num}")
-                    doc.close()
-                    return page_num
-            doc.close()
-            logger.warning("Contents page not found")
-            return None
-        except Exception as e:
-            logger.error(f"Error finding contents page: {e}")
-            return None
-    def extract_subject_content_pages(self, pdf_path: str, contents_page: int) -> Optional[Tuple[int, int]]:
-        """
-        Extracts subject content page range using the ContentsExtractor.
-        Focuses on "Subject content" section.
-        """
-        try:
-            doc = fitz.open(pdf_path)
-            contents_text = doc[contents_page].get_text()
-            doc.close()
-            # Use the ContentsExtractor to parse the Contents page
-            json_result = self.contents_extractor.extract_contents(contents_text)
-            topics_dict = json.loads(json_result)
-            # Look for subject content topics (with variations in naming)
-            subject_content_key = None
-            for key in topics_dict:
-                if 'subject content' in key.lower():
-                    subject_content_key = key
-                    break
-            if subject_content_key:
-                start_page, end_page = topics_dict[subject_content_key]
-                logger.info(f"Found subject content pages: {start_page} to {end_page}")
-                return start_page, end_page
-            else:
-                logger.warning("Subject content section not found in contents")
-                return None
-        except Exception as e:
-            logger.error(f"Error extracting subject content pages: {e}")
-            return None
-    def extract_pages_to_new_pdf(self, input_pdf: str, start_page: int, end_page: int) -> str:
-        """
-        Creates a new PDF containing only the specified page range.
-        """
-        try:
-            doc = fitz.open(input_pdf)
-            new_doc = fitz.open()
-            # Convert from page numbers in contents (1-based) to 0-based indices
-            start_idx = start_page - 1
-            end_idx = end_page - 1
-            # Ensure valid page range
-            start_idx = max(0, start_idx)
-            end_idx = min(doc.page_count - 1, end_idx)
-            # Copy pages from original to new document
-            for page_num in range(start_idx, end_idx + 1):
-                new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
-            # Save new PDF
-            temp_pdf_path = os.path.join(self.output_folder, "subject_content.pdf")
-            new_doc.save(temp_pdf_path)
-            new_doc.close()
-            doc.close()
-            logger.info(f"Created new PDF with pages {start_page} to {end_page} at {temp_pdf_path}")
-            return temp_pdf_path
-        except Exception as e:
-            logger.error(f"Error extracting pages to new PDF: {e}")
-            return input_pdf  # Return original if extraction fails
-    def process(self, pdf_path: str) -> Optional[str]:
-        """
-        Main processing function:
-        1. Check if PDF is a specification document
-        2. Find the Contents page
-        3. Extract subject content page range
-        4. Create a new PDF with only those pages
-        5. Process the new PDF using the existing PDF processor
-        """
-        try:
-            # Check if it's a specification document
-            is_spec = self.check_for_specification(pdf_path)
-            if not is_spec:
-                logger.info(f"Not a specification document, processing entire PDF: {pdf_path}")
-                return self.pdf_processor.process(pdf_path)
-            # Find the Contents page
-            contents_page = self.find_contents_page(pdf_path)
-            if contents_page is None:
-                logger.warning("Contents page not found, processing entire PDF")
-                return self.pdf_processor.process(pdf_path)
-            # Extract subject content page range
-            page_range = self.extract_subject_content_pages(pdf_path, contents_page)
-            if page_range is None:
-                logger.warning("Subject content section not found, processing entire PDF")
-                return self.pdf_processor.process(pdf_path)
-            start_page, end_page = page_range
-            # Create new PDF with only subject content pages
-            subject_content_pdf = self.extract_pages_to_new_pdf(pdf_path, start_page, end_page)
-            # Process the new PDF
-            logger.info(f"Processing subject content PDF: {subject_content_pdf}")
-            markdown_result = self.pdf_processor.process(subject_content_pdf)
-            # Add metadata about the extraction
-            metadata = (
-                f"# Extracted Subject Content\n\n"
-                f"Source document: {os.path.basename(pdf_path)}\n"
-                f"Pages: {start_page} to {end_page}\n\n"
-                f"---\n\n"
-            )
-            final_markdown = metadata + markdown_result
-            # Save the final markdown
-            final_md_path = os.path.join(self.output_folder, "final_output_with_metadata.md")
-            with open(final_md_path, "w", encoding="utf-8") as f:
-                f.write(final_markdown)
-            return final_markdown
-        except Exception as e:
-            logger.error(f"Error in selective processing: {e}")
-            # Fallback to processing the entire PDF
-            return self.pdf_processor.process(pdf_path)
 if __name__ == "__main__":
-    # API key should be stored securely, this is just for demonstration
-    GEMINI_API_KEY = "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU"  # Same as in the original scripts
-    input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
-    output_dir = "/home/user/app/input_output/outputs"
-    processor = SelectivePDFProcessor(output_folder=output_dir, api_key=GEMINI_API_KEY)
-    result = processor.process(input_pdf)
-    if result:
-        logger.info("Processing completed successfully")
-    else:
-        logger.error("Processing failed")

 #!/usr/bin/env python3
+import cv2
+import numpy as np
 import os
+from joblib import load
+class SVMModel:
+    """Wrap a joblib-serialized model for classifying encoded images.
+    The model file path is read from the SVM_MODEL_PATH environment
+    variable, falling back to a hard-coded default path.
+    """
+    def __init__(self):
+        path = os.getenv("SVM_MODEL_PATH", "/home/user/app/model_classification/svm_model.joblib")
+        # NOTE(review): joblib.load unpickles the file, which can execute
+        # arbitrary code; SVM_MODEL_PATH must only point at trusted files.
+        self.model = load(path)
+    def classify_image(
+        self,
+        image_bytes: bytes,
+        image_size=(128, 128)
+    ) -> int:
+        """Decode, resize and classify an encoded image.
+        Args:
+            image_bytes: Raw encoded image data (e.g. the contents of a file).
+            image_size: Size handed to cv2.resize before flattening;
+                presumably must match the size used at training time -- confirm.
+        Returns:
+            The predicted label as a plain int, or 0 when the bytes are
+            empty or cannot be decoded as an image.
+        """
+        # An empty buffer cannot be decoded; treat it like any other unreadable image.
+        if not image_bytes:
+            return 0
+        img = cv2.imdecode(np.frombuffer(image_bytes, np.uint8), cv2.IMREAD_COLOR)
+        if img is None:
+            # If image fails to load, default to "irrelevant" or handle differently
+            return 0
+        img = cv2.resize(img, image_size)
+        # Flatten to a single row: predict() is called with one sample of raw pixel values.
+        x = img.flatten().reshape(1, -1)
+        pred = self.model.predict(x)[0]
+        # predict() typically yields a NumPy scalar; convert to honour the declared int return type.
+        return int(pred)
 if __name__ == "__main__":
+    # Manual smoke test. The previous version called load_svm_model() and a
+    # free classify_image() that no longer exist in this file; go through
+    # SVMModel instead, which reads its model path from SVM_MODEL_PATH.
+    os.environ.setdefault("SVM_MODEL_PATH", "/home/user/app/model_classification/svm_model_2.joblib")
+    model = SVMModel()
+    # classify_image expects the encoded image bytes, not a file path.
+    with open("test.jpg", "rb") as f:
+        result = model.classify_image(f.read())
+    print("Classification result:", result)

selective_pdf_extractor.log ADDED Viewed

File without changes