SkyNait committed
Commit 41b09be · 1 parent: 91de769
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full changeset.
Files changed (50):
  1. __pycache__/contents_extractor_v2.cpython-310.pyc +0 -0
  2. __pycache__/inference_svm_model.cpython-310.pyc +0 -0
  3. __pycache__/mineru_single.cpython-310.pyc +0 -0
  4. __pycache__/mineru_test_local.cpython-310.pyc +0 -0
  5. __pycache__/topic_extraction_upgrade.cpython-310.pyc +0 -0
  6. __pycache__/worker.cpython-310.pyc +0 -0
  7. contents_extractor_v2.py +110 -0
  8. inference_svm_model.py +205 -24
  9. input_output/output/final_output.md +0 -0
  10. input_output/output/images/img_1.png +0 -0
  11. input_output/output/images/img_10.png +0 -0
  12. input_output/output/images/img_10.png_rows/row_0/col_0.png +0 -0
  13. input_output/output/images/img_10.png_rows/row_0/col_2.png +0 -0
  14. input_output/output/images/img_11.png +0 -0
  15. input_output/output/images/img_11.png_rows/row_0/col_0.png +0 -0
  16. input_output/output/images/img_11.png_rows/row_0/col_1.png +0 -0
  17. input_output/output/images/img_11.png_rows/row_1/col_0.png +0 -0
  18. input_output/output/images/img_11.png_rows/row_1/col_1.png +0 -0
  19. input_output/output/images/img_11.png_rows/row_2/col_0.png +0 -0
  20. input_output/output/images/img_11.png_rows/row_3/col_0.png +0 -0
  21. input_output/output/images/img_12.png +0 -0
  22. input_output/output/images/img_12.png_rows/row_0/col_0.png +0 -0
  23. input_output/output/images/img_12.png_rows/row_0/col_1.png +0 -0
  24. input_output/output/images/img_12.png_rows/row_1/col_0.png +0 -0
  25. input_output/output/images/img_12.png_rows/row_1/col_1.png +0 -0
  26. input_output/output/images/img_12.png_rows/row_2/col_0.png +0 -0
  27. input_output/output/images/img_12.png_rows/row_2/col_1.png +0 -0
  28. input_output/output/images/img_12.png_rows/row_3/col_0.png +0 -0
  29. input_output/output/images/img_13.png +0 -0
  30. input_output/output/images/img_13.png_rows/row_0/col_0.png +0 -0
  31. input_output/output/images/img_13.png_rows/row_0/col_1.png +0 -0
  32. input_output/output/images/img_13.png_rows/row_1/col_0.png +0 -0
  33. input_output/output/images/img_13.png_rows/row_1/col_1.png +0 -0
  34. input_output/output/images/img_13.png_rows/row_2/col_0.png +0 -0
  35. input_output/output/images/img_13.png_rows/row_3/col_0.png +0 -0
  36. input_output/output/images/img_13.png_rows/row_3/col_1.png +0 -0
  37. input_output/output/images/img_14.png +0 -0
  38. input_output/output/images/img_14.png_rows/row_0/col_0.png +0 -0
  39. input_output/output/images/img_14.png_rows/row_0/col_1.png +0 -0
  40. input_output/output/images/img_14.png_rows/row_1/col_0.png +0 -0
  41. input_output/output/images/img_14.png_rows/row_1/col_1.png +0 -0
  42. input_output/output/images/img_14.png_rows/row_2/col_0.png +0 -0
  43. input_output/output/images/img_14.png_rows/row_3/col_0.png +0 -0
  44. input_output/output/images/img_14.png_rows/row_4/col_0.png +0 -0
  45. input_output/output/images/img_14.png_rows/row_5/col_0.png +0 -0
  46. input_output/output/images/img_15.png +0 -0
  47. input_output/output/images/img_15.png_rows/row_0/col_0.png +0 -0
  48. input_output/output/images/img_15.png_rows/row_0/col_1.png +0 -0
  49. input_output/output/images/img_15.png_rows/row_1/col_0.png +0 -0
  50. input_output/output/images/img_15.png_rows/row_1/col_1.png +0 -0
__pycache__/contents_extractor_v2.cpython-310.pyc ADDED
Binary file (5.09 kB).
 
__pycache__/inference_svm_model.cpython-310.pyc CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ
 
__pycache__/mineru_single.cpython-310.pyc CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ
 
__pycache__/mineru_test_local.cpython-310.pyc ADDED
Binary file (11.9 kB).
 
__pycache__/topic_extraction_upgrade.cpython-310.pyc CHANGED
Binary files a/__pycache__/topic_extraction_upgrade.cpython-310.pyc and b/__pycache__/topic_extraction_upgrade.cpython-310.pyc differ
 
__pycache__/worker.cpython-310.pyc CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
 
contents_extractor_v2.py ADDED
@@ -0,0 +1,110 @@
+ from google import genai
+ from google.genai import types
+ import fitz  # PyMuPDF
+ import requests
+
+ MODEL = "gemini-2.0-flash"
+
+ # TODO: Make sure the last page is always included.
+
+
+ class ContentsExtractor:
+     def __init__(self, api_key: str):
+         self.client = genai.Client(api_key=api_key)
+
+     @staticmethod
+     def extract_first_pages(pdf_path, num_pages=4, is_path_url=False):
+         try:
+             if is_path_url:
+                 r = requests.get(pdf_path)
+                 data = r.content
+                 doc = fitz.open(stream=data, filetype="pdf")
+             else:
+                 doc = fitz.open(pdf_path)
+             total_pages = doc.page_count
+             pages_to_read = min(total_pages, num_pages)
+             all_text = []
+             for page_num in range(pages_to_read):
+                 page = doc[page_num]
+                 page_text = page.get_text()
+                 all_text.append(page_text)
+             doc.close()
+             return "\n".join(all_text)
+         except Exception as e:
+             print(f"Failed to extract text from PDF: {e}")
+             return None
+
+     def extract_contents(self, content):
+         response = self.client.models.generate_content(
+             model=MODEL,
+             contents=[f"""
+ Task:
+ You will be provided with the first pages of an exam board document. Your goal is to extract
+ the main subject-related topics from the "Contents" section and structure them as valid JSON.
+
+ Instructions:
+ 1. Identify the "Contents" section, which lists all topics, subtopics, and their corresponding pages.
+ 2. Extract only the **highest-level, subject-related topics** (ignore organizational or administrative sections).
+ 3. If a topic has subtopics, include the full range of pages from the first to the last subtopic.
+ 4. Return the output in the following JSON format:
+
+ {{
+     "topic_name": [start_page, end_page]
+ }}
+
+ Important Notes:
+ - Ignore non-subject-related sections (e.g., "Introduction", "Exam Guidelines", "Appendices").
+ - If a topic has subtopics, **only extract the main topic**, ensuring the page range covers all subtopics.
+ - The extracted topics should represent major academic areas, not organizational or structural elements.
+ - Only extract main topics without sub-topic numeration. Any topic with additional numbering (e.g., '3.1 Some Topic')
+   should be ignored, as it is a sub-topic rather than a primary subject-related topic.
+ - Make sure that all of a topic's pages are included: the end page should be the start page of the topic
+   that comes immediately after the extracted one in the Contents section.
+
+ Examples:
+ 1. Given this table of contents:
+
+     1. Introduction - 1
+     2. Exam Rules - 4
+     3. Subject content - 8
+         3.1 Algebra - 12
+         3.2 Geometry - 16
+         3.3 Probability - 20
+     4. The topics of subject of physics - 25
+         4.1 Mechanics - 30
+         4.2 Thermodynamics - 35
+     5. Appendices - 40
+
+ The correct output should be:
+
+ {{
+     "3. Subject content": [8, 25],
+     "4. The topics of subject of physics": [25, 40]
+ }}
+
+ 2. Given this table of contents:
+
+     1. Welcome Note - 1
+     2. Exam Overview - 3
+     3. Biology - 5
+         3.1 Cell Biology - 7
+         3.2 Genetics - 12
+         3.3 Ecology - 18
+     4. Chemistry - 22
+         4.1 Organic Chemistry - 25
+         4.2 Inorganic Chemistry - 30
+         4.3 Physical Chemistry - 35
+     5. References - 43
+
+ The correct output should be:
+
+ {{
+     "Biology": [5, 22],
+     "Chemistry": [22, 43]
+ }}
+
+ Now, extract topics from this text: {content}
+ """],
+             config=types.GenerateContentConfig(temperature=0.)
+         )
+         return response.text.strip().replace("```json", "").replace("```", "")
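For reference, a minimal sketch of how the new extractor might be driven end to end. The environment variable, the sample file name "spec.pdf", and the JSON handling are illustrative assumptions, not part of this commit:

    import json
    import os

    from contents_extractor_v2 import ContentsExtractor

    # Assumes GEMINI_API_KEY is exported in the environment and that the
    # document's Contents section falls within its first four pages.
    extractor = ContentsExtractor(api_key=os.environ["GEMINI_API_KEY"])
    pages_text = ContentsExtractor.extract_first_pages("spec.pdf", num_pages=4)
    if pages_text:
        topics = json.loads(extractor.extract_contents(pages_text))
        for topic, (start, end) in topics.items():
            print(f"{topic}: pages {start}-{end}")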
inference_svm_model.py CHANGED
@@ -1,31 +1,212 @@
  #!/usr/bin/env python3
- import cv2
- import numpy as np
  import os
- from joblib import load
-
-
- class SVMModel:
-     def __init__(self):
-         path = os.getenv("SVM_MODEL_PATH", "/home/user/app/model_classification/svm_model.joblib")
-         self.model = load(path)
-
-     def classify_image(
-         self,
-         image_bytes: bytes,
-         image_size=(128, 128)
-     ) -> int:
-         img = cv2.imdecode(np.frombuffer(image_bytes, np.uint8), cv2.IMREAD_COLOR)
-         if img is None:
-             # If the image fails to load, default to "irrelevant" or handle differently
-             return 0
-
-         img = cv2.resize(img, image_size)
-         x = img.flatten().reshape(1, -1)
-         pred = self.model.predict(x)[0]
-         return pred
+ import re
+ import json
+ import logging
+ import fitz  # PyMuPDF
+ from typing import Optional, Tuple, Dict, List
+
+ from contents_extractor_v2 import ContentsExtractor
+ from mineru_test_local import LocalPDFProcessor
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
+     handlers=[
+         logging.StreamHandler(),
+         logging.FileHandler('selective_pdf_extractor.log')
+     ]
+ )
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+
+ class SelectivePDFProcessor:
+     """
+     Processes PDF files by extracting only subject-content sections.
+     First checks whether the file is a specification document, then finds the
+     Contents page, extracts the subject-content page range, and processes only
+     those pages.
+     """
+
+     def __init__(self, output_folder: str, api_key: str):
+         self.output_folder = output_folder
+         os.makedirs(self.output_folder, exist_ok=True)
+         self.api_key = api_key
+         self.contents_extractor = ContentsExtractor(api_key=api_key)
+         self.pdf_processor = LocalPDFProcessor(output_folder=output_folder)
+
+     def check_for_specification(self, pdf_path: str) -> bool:
+         """
+         Checks whether the PDF is a specification document by looking for the
+         word 'specification' on the first page.
+         """
+         try:
+             doc = fitz.open(pdf_path)
+             first_page_text = doc[0].get_text().lower()
+             doc.close()
+             return 'specification' in first_page_text
+         except Exception as e:
+             logger.error(f"Error checking for specification: {e}")
+             return False
+
+     def find_contents_page(self, pdf_path: str) -> Optional[int]:
+         """
+         Finds the page number of the Contents section.
+         """
+         try:
+             doc = fitz.open(pdf_path)
+             # Check the first 20 pages for "Contents"
+             # (assuming the Contents section starts within the first 20 pages)
+             max_pages = min(20, doc.page_count)
+
+             for page_num in range(max_pages):
+                 page_text = doc[page_num].get_text()
+                 # Look for "Contents" as a standalone heading
+                 if re.search(r'^\s*Contents\s*$', page_text, re.MULTILINE):
+                     logger.info(f"Found Contents page at page {page_num}")
+                     doc.close()
+                     return page_num
+
+             doc.close()
+             logger.warning("Contents page not found")
+             return None
+         except Exception as e:
+             logger.error(f"Error finding contents page: {e}")
+             return None
+
+     def extract_subject_content_pages(self, pdf_path: str, contents_page: int) -> Optional[Tuple[int, int]]:
+         """
+         Extracts the subject-content page range using the ContentsExtractor.
+         Focuses on the "Subject content" section.
+         """
+         try:
+             doc = fitz.open(pdf_path)
+             contents_text = doc[contents_page].get_text()
+             doc.close()
+
+             # Use the ContentsExtractor to parse the Contents page
+             json_result = self.contents_extractor.extract_contents(contents_text)
+             topics_dict = json.loads(json_result)
+
+             # Look for subject-content topics (allowing for variations in naming)
+             subject_content_key = None
+             for key in topics_dict:
+                 if 'subject content' in key.lower():
+                     subject_content_key = key
+                     break
+
+             if subject_content_key:
+                 start_page, end_page = topics_dict[subject_content_key]
+                 logger.info(f"Found subject content pages: {start_page} to {end_page}")
+                 return start_page, end_page
+             else:
+                 logger.warning("Subject content section not found in contents")
+                 return None
+         except Exception as e:
+             logger.error(f"Error extracting subject content pages: {e}")
+             return None
+
+     def extract_pages_to_new_pdf(self, input_pdf: str, start_page: int, end_page: int) -> str:
+         """
+         Creates a new PDF containing only the specified page range.
+         """
+         try:
+             doc = fitz.open(input_pdf)
+             new_doc = fitz.open()
+
+             # Convert from page numbers in the contents (1-based) to 0-based indices
+             start_idx = start_page - 1
+             end_idx = end_page - 1
+
+             # Clamp to a valid page range
+             start_idx = max(0, start_idx)
+             end_idx = min(doc.page_count - 1, end_idx)
+
+             # Copy pages from the original to the new document
+             for page_num in range(start_idx, end_idx + 1):
+                 new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
+
+             # Save the new PDF
+             temp_pdf_path = os.path.join(self.output_folder, "subject_content.pdf")
+             new_doc.save(temp_pdf_path)
+             new_doc.close()
+             doc.close()
+
+             logger.info(f"Created new PDF with pages {start_page} to {end_page} at {temp_pdf_path}")
+             return temp_pdf_path
+         except Exception as e:
+             logger.error(f"Error extracting pages to new PDF: {e}")
+             return input_pdf  # Return the original if extraction fails
+
+     def process(self, pdf_path: str) -> Optional[str]:
+         """
+         Main processing function:
+         1. Check whether the PDF is a specification document
+         2. Find the Contents page
+         3. Extract the subject-content page range
+         4. Create a new PDF with only those pages
+         5. Process the new PDF using the existing PDF processor
+         """
+         try:
+             # Check whether it's a specification document
+             is_spec = self.check_for_specification(pdf_path)
+             if not is_spec:
+                 logger.info(f"Not a specification document, processing entire PDF: {pdf_path}")
+                 return self.pdf_processor.process(pdf_path)
+
+             # Find the Contents page
+             contents_page = self.find_contents_page(pdf_path)
+             if contents_page is None:
+                 logger.warning("Contents page not found, processing entire PDF")
+                 return self.pdf_processor.process(pdf_path)
+
+             # Extract the subject-content page range
+             page_range = self.extract_subject_content_pages(pdf_path, contents_page)
+             if page_range is None:
+                 logger.warning("Subject content section not found, processing entire PDF")
+                 return self.pdf_processor.process(pdf_path)
+
+             start_page, end_page = page_range
+
+             # Create a new PDF with only the subject-content pages
+             subject_content_pdf = self.extract_pages_to_new_pdf(pdf_path, start_page, end_page)
+
+             # Process the new PDF
+             logger.info(f"Processing subject content PDF: {subject_content_pdf}")
+             markdown_result = self.pdf_processor.process(subject_content_pdf)
+
+             # Add metadata about the extraction
+             metadata = (
+                 f"# Extracted Subject Content\n\n"
+                 f"Source document: {os.path.basename(pdf_path)}\n"
+                 f"Pages: {start_page} to {end_page}\n\n"
+                 f"---\n\n"
+             )
+
+             final_markdown = metadata + markdown_result
+
+             # Save the final markdown
+             final_md_path = os.path.join(self.output_folder, "final_output_with_metadata.md")
+             with open(final_md_path, "w", encoding="utf-8") as f:
+                 f.write(final_markdown)
+
+             return final_markdown
+         except Exception as e:
+             logger.error(f"Error in selective processing: {e}")
+             # Fall back to processing the entire PDF
+             return self.pdf_processor.process(pdf_path)
+
  if __name__ == "__main__":
-     model = load_svm_model("/home/user/app/model_classification/svm_model_2.joblib")
-     result = classify_image("test.jpg", model)
-     print("Classification result:", result)
+     # The API key should be stored securely; read it from the environment
+     # rather than hard-coding it in the script.
+     GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
+
+     input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
+     output_dir = "/home/user/app/input_output/outputs"
+
+     processor = SelectivePDFProcessor(output_folder=output_dir, api_key=GEMINI_API_KEY)
+     result = processor.process(input_pdf)
+
+     if result:
+         logger.info("Processing completed successfully")
+     else:
+         logger.error("Processing failed")
input_output/output/final_output.md CHANGED
The diff for this file is too large to render; see the raw diff.
 
input_output/output/images/img_1.png CHANGED
input_output/output/images/img_10.png CHANGED
input_output/output/images/img_10.png_rows/row_0/col_0.png CHANGED
input_output/output/images/img_10.png_rows/row_0/col_2.png ADDED
input_output/output/images/img_11.png CHANGED
input_output/output/images/img_11.png_rows/row_0/col_0.png CHANGED
input_output/output/images/img_11.png_rows/row_0/col_1.png CHANGED
input_output/output/images/img_11.png_rows/row_1/col_0.png CHANGED
input_output/output/images/img_11.png_rows/row_1/col_1.png ADDED
input_output/output/images/img_11.png_rows/row_2/col_0.png ADDED
input_output/output/images/img_11.png_rows/row_3/col_0.png ADDED
input_output/output/images/img_12.png CHANGED
input_output/output/images/img_12.png_rows/row_0/col_0.png CHANGED
input_output/output/images/img_12.png_rows/row_0/col_1.png CHANGED
input_output/output/images/img_12.png_rows/row_1/col_0.png CHANGED
input_output/output/images/img_12.png_rows/row_1/col_1.png CHANGED
input_output/output/images/img_12.png_rows/row_2/col_0.png ADDED
input_output/output/images/img_12.png_rows/row_2/col_1.png ADDED
input_output/output/images/img_12.png_rows/row_3/col_0.png ADDED
input_output/output/images/img_13.png CHANGED
input_output/output/images/img_13.png_rows/row_0/col_0.png CHANGED
input_output/output/images/img_13.png_rows/row_0/col_1.png CHANGED
input_output/output/images/img_13.png_rows/row_1/col_0.png CHANGED
input_output/output/images/img_13.png_rows/row_1/col_1.png CHANGED
input_output/output/images/img_13.png_rows/row_2/col_0.png CHANGED
input_output/output/images/img_13.png_rows/row_3/col_0.png ADDED
input_output/output/images/img_13.png_rows/row_3/col_1.png ADDED
input_output/output/images/img_14.png CHANGED
input_output/output/images/img_14.png_rows/row_0/col_0.png CHANGED
input_output/output/images/img_14.png_rows/row_0/col_1.png CHANGED
input_output/output/images/img_14.png_rows/row_1/col_0.png CHANGED
input_output/output/images/img_14.png_rows/row_1/col_1.png CHANGED
input_output/output/images/img_14.png_rows/row_2/col_0.png ADDED
input_output/output/images/img_14.png_rows/row_3/col_0.png ADDED
input_output/output/images/img_14.png_rows/row_4/col_0.png ADDED
input_output/output/images/img_14.png_rows/row_5/col_0.png ADDED
input_output/output/images/img_15.png CHANGED
input_output/output/images/img_15.png_rows/row_0/col_0.png CHANGED
input_output/output/images/img_15.png_rows/row_0/col_1.png CHANGED
input_output/output/images/img_15.png_rows/row_1/col_0.png CHANGED
input_output/output/images/img_15.png_rows/row_1/col_1.png ADDED