SkyNait committed on
Commit
a6a7c69
·
1 Parent(s): 8966134
__pycache__/mineru_test_local.cpython-310.pyc CHANGED
Binary files a/__pycache__/mineru_test_local.cpython-310.pyc and b/__pycache__/mineru_test_local.cpython-310.pyc differ
 
__pycache__/table_row_extraction.cpython-310.pyc ADDED
Binary file (10.9 kB). View file
 
inference_svm_model.py CHANGED
@@ -1,212 +1,31 @@
1
  #!/usr/bin/env python3
 
 
2
  import os
3
- import re
4
- import json
5
- import logging
6
- import fitz # PyMuPDF
7
- from typing import Optional, Tuple, Dict, List
8
 
9
- from contents_extractor_v2 import ContentsExtractor
10
- from mineru_test_local import LocalPDFProcessor
11
 
12
- # Configure logging
13
- logging.basicConfig(
14
- level=logging.INFO,
15
- format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
16
- handlers=[
17
- logging.StreamHandler(),
18
- logging.FileHandler('selective_pdf_extractor.log')
19
- ]
20
- )
21
- logger = logging.getLogger(__name__)
22
- logger.setLevel(logging.INFO)
23
 
24
- class SelectivePDFProcessor:
25
- """
26
- Processes PDF files by extracting only subject content sections.
27
- First identifies if it's a specification document, then finds the Contents page,
28
- extracts subject content page ranges, and processes only those pages.
29
- """
30
-
31
- def __init__(self, output_folder: str, api_key: str):
32
- self.output_folder = output_folder
33
- os.makedirs(self.output_folder, exist_ok=True)
34
- self.api_key = api_key
35
- self.contents_extractor = ContentsExtractor(api_key=api_key)
36
- self.pdf_processor = LocalPDFProcessor(output_folder=output_folder)
37
-
38
- def check_for_specification(self, pdf_path: str) -> bool:
39
- """
40
- Checks if the PDF is a specification document by looking for the word 'specification'
41
- on the first page.
42
- """
43
- try:
44
- doc = fitz.open(pdf_path)
45
- first_page_text = doc[0].get_text().lower()
46
- doc.close()
47
- return 'specification' in first_page_text
48
- except Exception as e:
49
- logger.error(f"Error checking for specification: {e}")
50
- return False
51
-
52
- def find_contents_page(self, pdf_path: str) -> Optional[int]:
53
- """
54
- Finds the page number of the Contents section.
55
- """
56
- try:
57
- doc = fitz.open(pdf_path)
58
- # Check first 20 pages for "Contents"
59
- # (assuming Contents is within the first 20 pages)
60
- max_pages = min(20, doc.page_count)
61
-
62
- for page_num in range(max_pages):
63
- page_text = doc[page_num].get_text()
64
- # Look for "Contents" as a standalone heading
65
- if re.search(r'^\s*Contents\s*$', page_text, re.MULTILINE):
66
- logger.info(f"Found Contents page at page {page_num}")
67
- doc.close()
68
- return page_num
69
-
70
- doc.close()
71
- logger.warning("Contents page not found")
72
- return None
73
- except Exception as e:
74
- logger.error(f"Error finding contents page: {e}")
75
- return None
76
-
77
- def extract_subject_content_pages(self, pdf_path: str, contents_page: int) -> Optional[Tuple[int, int]]:
78
- """
79
- Extracts subject content page range using the ContentsExtractor.
80
- Focuses on "Subject content" section.
81
- """
82
- try:
83
- doc = fitz.open(pdf_path)
84
- contents_text = doc[contents_page].get_text()
85
- doc.close()
86
-
87
- # Use the ContentsExtractor to parse the Contents page
88
- json_result = self.contents_extractor.extract_contents(contents_text)
89
- topics_dict = json.loads(json_result)
90
-
91
- # Look for subject content topics (with variations in naming)
92
- subject_content_key = None
93
- for key in topics_dict:
94
- if 'subject content' in key.lower():
95
- subject_content_key = key
96
- break
97
-
98
- if subject_content_key:
99
- start_page, end_page = topics_dict[subject_content_key]
100
- logger.info(f"Found subject content pages: {start_page} to {end_page}")
101
- return start_page, end_page
102
- else:
103
- logger.warning("Subject content section not found in contents")
104
- return None
105
- except Exception as e:
106
- logger.error(f"Error extracting subject content pages: {e}")
107
- return None
108
-
109
- def extract_pages_to_new_pdf(self, input_pdf: str, start_page: int, end_page: int) -> str:
110
- """
111
- Creates a new PDF containing only the specified page range.
112
- """
113
- try:
114
- doc = fitz.open(input_pdf)
115
- new_doc = fitz.open()
116
-
117
- # Convert from page numbers in contents (1-based) to 0-based indices
118
- start_idx = start_page - 1
119
- end_idx = end_page - 1
120
-
121
- # Ensure valid page range
122
- start_idx = max(0, start_idx)
123
- end_idx = min(doc.page_count - 1, end_idx)
124
-
125
- # Copy pages from original to new document
126
- for page_num in range(start_idx, end_idx + 1):
127
- new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
128
-
129
- # Save new PDF
130
- temp_pdf_path = os.path.join(self.output_folder, "subject_content.pdf")
131
- new_doc.save(temp_pdf_path)
132
- new_doc.close()
133
- doc.close()
134
-
135
- logger.info(f"Created new PDF with pages {start_page} to {end_page} at {temp_pdf_path}")
136
- return temp_pdf_path
137
- except Exception as e:
138
- logger.error(f"Error extracting pages to new PDF: {e}")
139
- return input_pdf # Return original if extraction fails
140
-
141
- def process(self, pdf_path: str) -> Optional[str]:
142
- """
143
- Main processing function:
144
- 1. Check if PDF is a specification document
145
- 2. Find the Contents page
146
- 3. Extract subject content page range
147
- 4. Create a new PDF with only those pages
148
- 5. Process the new PDF using the existing PDF processor
149
- """
150
- try:
151
- # Check if it's a specification document
152
- is_spec = self.check_for_specification(pdf_path)
153
- if not is_spec:
154
- logger.info(f"Not a specification document, processing entire PDF: {pdf_path}")
155
- return self.pdf_processor.process(pdf_path)
156
-
157
- # Find the Contents page
158
- contents_page = self.find_contents_page(pdf_path)
159
- if contents_page is None:
160
- logger.warning("Contents page not found, processing entire PDF")
161
- return self.pdf_processor.process(pdf_path)
162
-
163
- # Extract subject content page range
164
- page_range = self.extract_subject_content_pages(pdf_path, contents_page)
165
- if page_range is None:
166
- logger.warning("Subject content section not found, processing entire PDF")
167
- return self.pdf_processor.process(pdf_path)
168
-
169
- start_page, end_page = page_range
170
-
171
- # Create new PDF with only subject content pages
172
- subject_content_pdf = self.extract_pages_to_new_pdf(pdf_path, start_page, end_page)
173
-
174
- # Process the new PDF
175
- logger.info(f"Processing subject content PDF: {subject_content_pdf}")
176
- markdown_result = self.pdf_processor.process(subject_content_pdf)
177
-
178
- # Add metadata about the extraction
179
- metadata = (
180
- f"# Extracted Subject Content\n\n"
181
- f"Source document: {os.path.basename(pdf_path)}\n"
182
- f"Pages: {start_page} to {end_page}\n\n"
183
- f"---\n\n"
184
- )
185
-
186
- final_markdown = metadata + markdown_result
187
-
188
- # Save the final markdown
189
- final_md_path = os.path.join(self.output_folder, "final_output_with_metadata.md")
190
- with open(final_md_path, "w", encoding="utf-8") as f:
191
- f.write(final_markdown)
192
-
193
- return final_markdown
194
- except Exception as e:
195
- logger.error(f"Error in selective processing: {e}")
196
- # Fallback to processing the entire PDF
197
- return self.pdf_processor.process(pdf_path)
198
 
199
  if __name__ == "__main__":
200
- # API key should be stored securely, this is just for demonstration
201
- GEMINI_API_KEY = "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU" # Same as in the original scripts
202
-
203
- input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
204
- output_dir = "/home/user/app/input_output/outputs"
205
-
206
- processor = SelectivePDFProcessor(output_folder=output_dir, api_key=GEMINI_API_KEY)
207
- result = processor.process(input_pdf)
208
-
209
- if result:
210
- logger.info("Processing completed successfully")
211
- else:
212
- logger.error("Processing failed")
 
1
  #!/usr/bin/env python3
2
+ import cv2
3
+ import numpy as np
4
  import os
5
+ from joblib import load
 
 
 
 
6
 
 
 
7
 
8
class SVMModel:
    """Wrap a joblib-serialised classifier for single-image prediction.

    The model is loaded once at construction time and reused for every
    call to :meth:`classify_image`.
    """

    # Used when neither an explicit path nor SVM_MODEL_PATH is supplied.
    DEFAULT_MODEL_PATH = "/home/user/app/model_classification/svm_model.joblib"

    def __init__(self, model_path=None):
        """Load the serialised model from disk.

        Args:
            model_path: Optional path to the ``.joblib`` file. When omitted,
                the ``SVM_MODEL_PATH`` environment variable is used, falling
                back to ``DEFAULT_MODEL_PATH``.

        Raises:
            Whatever ``joblib.load`` raises when the file is missing or
            cannot be deserialised.
        """
        if model_path is None:
            model_path = os.getenv("SVM_MODEL_PATH", self.DEFAULT_MODEL_PATH)
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only point this at model files you trust.
        self.model = load(model_path)

    def classify_image(
        self,
        image_bytes: bytes,
        image_size=(128, 128)
    ) -> int:
        """Predict the class label for one encoded image.

        Args:
            image_bytes: Raw contents of an encoded image file.
            image_size: Size passed to ``cv2.resize`` before the pixels are
                flattened into the feature vector. Presumably must match the
                size used when the model was trained -- confirm against the
                training script.

        Returns:
            The predicted label as a plain Python ``int``. Returns ``0``
            (the "irrelevant" default) when the image cannot be decoded.
        """
        # cv2.imdecode raises on an empty buffer instead of returning None,
        # so treat "no data" the same way as "undecodable data".
        if not image_bytes:
            return 0

        buffer = np.frombuffer(image_bytes, np.uint8)
        img = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
        if img is None:
            # If image fails to load, default to "irrelevant".
            return 0

        img = cv2.resize(img, image_size)
        # One sample per row, as expected by predict().
        features = img.flatten().reshape(1, -1)
        # predict() yields a NumPy scalar; convert it so callers get the
        # builtin int promised by the signature (e.g. JSON-serialisable).
        return int(self.model.predict(features)[0])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
if __name__ == "__main__":
    # Manual smoke test: classify a local image with the second model file.
    # SVMModel reads its path from SVM_MODEL_PATH, so set a default here
    # without overriding a value the caller already exported.
    os.environ.setdefault(
        "SVM_MODEL_PATH",
        "/home/user/app/model_classification/svm_model_2.joblib",
    )
    model = SVMModel()
    # classify_image expects the encoded image bytes, not a file path.
    with open("test.jpg", "rb") as image_file:
        result = model.classify_image(image_file.read())
    print("Classification result:", result)
 
 
 
 
 
 
 
 
 
 
selective_pdf_extractor.log ADDED
File without changes