test
Browse files
__pycache__/mineru_test_local.cpython-310.pyc
CHANGED
Binary files a/__pycache__/mineru_test_local.cpython-310.pyc and b/__pycache__/mineru_test_local.cpython-310.pyc differ
|
|
__pycache__/table_row_extraction.cpython-310.pyc
ADDED
Binary file (10.9 kB). View file
|
|
inference_svm_model.py
CHANGED
@@ -1,212 +1,31 @@
|
|
1 |
#!/usr/bin/env python3
|
|
|
|
|
2 |
import os
|
3 |
-
import
|
4 |
-
import json
|
5 |
-
import logging
|
6 |
-
import fitz # PyMuPDF
|
7 |
-
from typing import Optional, Tuple, Dict, List
|
8 |
|
9 |
-
from contents_extractor_v2 import ContentsExtractor
|
10 |
-
from mineru_test_local import LocalPDFProcessor
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
handlers=[
|
17 |
-
logging.StreamHandler(),
|
18 |
-
logging.FileHandler('selective_pdf_extractor.log')
|
19 |
-
]
|
20 |
-
)
|
21 |
-
logger = logging.getLogger(__name__)
|
22 |
-
logger.setLevel(logging.INFO)
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
def check_for_specification(self, pdf_path: str) -> bool:
|
39 |
-
"""
|
40 |
-
Checks if the PDF is a specification document by looking for the word 'specification'
|
41 |
-
on the first page.
|
42 |
-
"""
|
43 |
-
try:
|
44 |
-
doc = fitz.open(pdf_path)
|
45 |
-
first_page_text = doc[0].get_text().lower()
|
46 |
-
doc.close()
|
47 |
-
return 'specification' in first_page_text
|
48 |
-
except Exception as e:
|
49 |
-
logger.error(f"Error checking for specification: {e}")
|
50 |
-
return False
|
51 |
-
|
52 |
-
def find_contents_page(self, pdf_path: str) -> Optional[int]:
|
53 |
-
"""
|
54 |
-
Finds the page number of the Contents section.
|
55 |
-
"""
|
56 |
-
try:
|
57 |
-
doc = fitz.open(pdf_path)
|
58 |
-
# Check first 20 pages for "Contents"
|
59 |
-
# (assuming Contents is within the first 20 pages)
|
60 |
-
max_pages = min(20, doc.page_count)
|
61 |
-
|
62 |
-
for page_num in range(max_pages):
|
63 |
-
page_text = doc[page_num].get_text()
|
64 |
-
# Look for "Contents" as a standalone heading
|
65 |
-
if re.search(r'^\s*Contents\s*$', page_text, re.MULTILINE):
|
66 |
-
logger.info(f"Found Contents page at page {page_num}")
|
67 |
-
doc.close()
|
68 |
-
return page_num
|
69 |
-
|
70 |
-
doc.close()
|
71 |
-
logger.warning("Contents page not found")
|
72 |
-
return None
|
73 |
-
except Exception as e:
|
74 |
-
logger.error(f"Error finding contents page: {e}")
|
75 |
-
return None
|
76 |
-
|
77 |
-
def extract_subject_content_pages(self, pdf_path: str, contents_page: int) -> Optional[Tuple[int, int]]:
|
78 |
-
"""
|
79 |
-
Extracts subject content page range using the ContentsExtractor.
|
80 |
-
Focuses on "Subject content" section.
|
81 |
-
"""
|
82 |
-
try:
|
83 |
-
doc = fitz.open(pdf_path)
|
84 |
-
contents_text = doc[contents_page].get_text()
|
85 |
-
doc.close()
|
86 |
-
|
87 |
-
# Use the ContentsExtractor to parse the Contents page
|
88 |
-
json_result = self.contents_extractor.extract_contents(contents_text)
|
89 |
-
topics_dict = json.loads(json_result)
|
90 |
-
|
91 |
-
# Look for subject content topics (with variations in naming)
|
92 |
-
subject_content_key = None
|
93 |
-
for key in topics_dict:
|
94 |
-
if 'subject content' in key.lower():
|
95 |
-
subject_content_key = key
|
96 |
-
break
|
97 |
-
|
98 |
-
if subject_content_key:
|
99 |
-
start_page, end_page = topics_dict[subject_content_key]
|
100 |
-
logger.info(f"Found subject content pages: {start_page} to {end_page}")
|
101 |
-
return start_page, end_page
|
102 |
-
else:
|
103 |
-
logger.warning("Subject content section not found in contents")
|
104 |
-
return None
|
105 |
-
except Exception as e:
|
106 |
-
logger.error(f"Error extracting subject content pages: {e}")
|
107 |
-
return None
|
108 |
-
|
109 |
-
def extract_pages_to_new_pdf(self, input_pdf: str, start_page: int, end_page: int) -> str:
|
110 |
-
"""
|
111 |
-
Creates a new PDF containing only the specified page range.
|
112 |
-
"""
|
113 |
-
try:
|
114 |
-
doc = fitz.open(input_pdf)
|
115 |
-
new_doc = fitz.open()
|
116 |
-
|
117 |
-
# Convert from page numbers in contents (1-based) to 0-based indices
|
118 |
-
start_idx = start_page - 1
|
119 |
-
end_idx = end_page - 1
|
120 |
-
|
121 |
-
# Ensure valid page range
|
122 |
-
start_idx = max(0, start_idx)
|
123 |
-
end_idx = min(doc.page_count - 1, end_idx)
|
124 |
-
|
125 |
-
# Copy pages from original to new document
|
126 |
-
for page_num in range(start_idx, end_idx + 1):
|
127 |
-
new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
|
128 |
-
|
129 |
-
# Save new PDF
|
130 |
-
temp_pdf_path = os.path.join(self.output_folder, "subject_content.pdf")
|
131 |
-
new_doc.save(temp_pdf_path)
|
132 |
-
new_doc.close()
|
133 |
-
doc.close()
|
134 |
-
|
135 |
-
logger.info(f"Created new PDF with pages {start_page} to {end_page} at {temp_pdf_path}")
|
136 |
-
return temp_pdf_path
|
137 |
-
except Exception as e:
|
138 |
-
logger.error(f"Error extracting pages to new PDF: {e}")
|
139 |
-
return input_pdf # Return original if extraction fails
|
140 |
-
|
141 |
-
def process(self, pdf_path: str) -> Optional[str]:
|
142 |
-
"""
|
143 |
-
Main processing function:
|
144 |
-
1. Check if PDF is a specification document
|
145 |
-
2. Find the Contents page
|
146 |
-
3. Extract subject content page range
|
147 |
-
4. Create a new PDF with only those pages
|
148 |
-
5. Process the new PDF using the existing PDF processor
|
149 |
-
"""
|
150 |
-
try:
|
151 |
-
# Check if it's a specification document
|
152 |
-
is_spec = self.check_for_specification(pdf_path)
|
153 |
-
if not is_spec:
|
154 |
-
logger.info(f"Not a specification document, processing entire PDF: {pdf_path}")
|
155 |
-
return self.pdf_processor.process(pdf_path)
|
156 |
-
|
157 |
-
# Find the Contents page
|
158 |
-
contents_page = self.find_contents_page(pdf_path)
|
159 |
-
if contents_page is None:
|
160 |
-
logger.warning("Contents page not found, processing entire PDF")
|
161 |
-
return self.pdf_processor.process(pdf_path)
|
162 |
-
|
163 |
-
# Extract subject content page range
|
164 |
-
page_range = self.extract_subject_content_pages(pdf_path, contents_page)
|
165 |
-
if page_range is None:
|
166 |
-
logger.warning("Subject content section not found, processing entire PDF")
|
167 |
-
return self.pdf_processor.process(pdf_path)
|
168 |
-
|
169 |
-
start_page, end_page = page_range
|
170 |
-
|
171 |
-
# Create new PDF with only subject content pages
|
172 |
-
subject_content_pdf = self.extract_pages_to_new_pdf(pdf_path, start_page, end_page)
|
173 |
-
|
174 |
-
# Process the new PDF
|
175 |
-
logger.info(f"Processing subject content PDF: {subject_content_pdf}")
|
176 |
-
markdown_result = self.pdf_processor.process(subject_content_pdf)
|
177 |
-
|
178 |
-
# Add metadata about the extraction
|
179 |
-
metadata = (
|
180 |
-
f"# Extracted Subject Content\n\n"
|
181 |
-
f"Source document: {os.path.basename(pdf_path)}\n"
|
182 |
-
f"Pages: {start_page} to {end_page}\n\n"
|
183 |
-
f"---\n\n"
|
184 |
-
)
|
185 |
-
|
186 |
-
final_markdown = metadata + markdown_result
|
187 |
-
|
188 |
-
# Save the final markdown
|
189 |
-
final_md_path = os.path.join(self.output_folder, "final_output_with_metadata.md")
|
190 |
-
with open(final_md_path, "w", encoding="utf-8") as f:
|
191 |
-
f.write(final_markdown)
|
192 |
-
|
193 |
-
return final_markdown
|
194 |
-
except Exception as e:
|
195 |
-
logger.error(f"Error in selective processing: {e}")
|
196 |
-
# Fallback to processing the entire PDF
|
197 |
-
return self.pdf_processor.process(pdf_path)
|
198 |
|
199 |
if __name__ == "__main__":
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
|
204 |
-
output_dir = "/home/user/app/input_output/outputs"
|
205 |
-
|
206 |
-
processor = SelectivePDFProcessor(output_folder=output_dir, api_key=GEMINI_API_KEY)
|
207 |
-
result = processor.process(input_pdf)
|
208 |
-
|
209 |
-
if result:
|
210 |
-
logger.info("Processing completed successfully")
|
211 |
-
else:
|
212 |
-
logger.error("Processing failed")
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
+
import cv2
|
3 |
+
import numpy as np
|
4 |
import os
|
5 |
+
from joblib import load
|
|
|
|
|
|
|
|
|
6 |
|
|
|
|
|
7 |
|
8 |
+
class SVMModel:
    """Wraps a pre-trained SVM (scikit-learn, persisted via joblib) for image classification.

    The classifier operates on raw encoded image bytes (e.g. JPEG/PNG); images are
    decoded with OpenCV, resized to a fixed size, and flattened into one feature row.
    """

    def __init__(self):
        # Model path is overridable via SVM_MODEL_PATH so deployments can swap
        # models without a code change; falls back to the bundled default.
        path = os.getenv("SVM_MODEL_PATH", "/home/user/app/model_classification/svm_model.joblib")
        self.model = load(path)

    def classify_image(
        self,
        image_bytes: bytes,
        image_size=(128, 128)
    ) -> int:
        """Classify raw encoded image bytes and return the predicted class label.

        Args:
            image_bytes: encoded image data (as read from disk or an upload).
            image_size: (width, height) the image is resized to before prediction;
                must match the size the SVM was trained on.

        Returns:
            The predicted integer class label; 0 ("irrelevant") when the bytes
            cannot be decoded as an image.
        """
        img = cv2.imdecode(np.frombuffer(image_bytes, np.uint8), cv2.IMREAD_COLOR)
        if img is None:
            # If image fails to load, default to "irrelevant" or handle differently
            return 0

        img = cv2.resize(img, image_size)
        x = img.flatten().reshape(1, -1)
        pred = self.model.predict(x)[0]
        # predict() yields a numpy scalar (e.g. np.int64); cast so the return
        # value actually matches the declared -> int annotation.
        return int(pred)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
if __name__ == "__main__":
    # Smoke test: classify a sample image with the alternate model.
    # The original script called undefined load_svm_model()/classify_image()
    # free functions (NameError at runtime) and passed a filename where the
    # method expects bytes; route through the SVMModel class API instead,
    # selecting the alternate model via the existing env-var override.
    os.environ.setdefault("SVM_MODEL_PATH", "/home/user/app/model_classification/svm_model_2.joblib")
    model = SVMModel()
    with open("test.jpg", "rb") as f:
        result = model.classify_image(f.read())
    print("Classification result:", result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
selective_pdf_extractor.log
ADDED
File without changes
|