SkyNait committed on
Commit 6fc2b3e · 1 Parent(s): 5e4a36e

correct RabbitMQ

__pycache__/table_row_extraction.cpython-310.pyc CHANGED
Binary files a/__pycache__/table_row_extraction.cpython-310.pyc and b/__pycache__/table_row_extraction.cpython-310.pyc differ
 
__pycache__/topic_extr.cpython-310.pyc ADDED
Binary file (7.56 kB).
 
__pycache__/worker.cpython-310.pyc CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
 
input_output/168982-specification-gcse-mathematics.pdf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf3ff38c2035447e51b0d6dc8df35eeb1d5cc9296d77f310d71fe1d39c66062a
3
+ size 13646315
page_range.py DELETED
@@ -1,300 +0,0 @@
1
- #!/usr/bin/env python3
2
- import os
3
- import re
4
- import json
5
- import logging
6
- import fitz
7
- import requests
8
- import time
9
- from statistics import mode, median
10
- from typing import Dict, List, Tuple
11
-
12
- from google import genai
13
- from google.genai import types
14
-
15
- logging.basicConfig(level=logging.INFO)
16
- logger = logging.getLogger(__name__)
17
-
18
- def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> list:
19
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
20
- st_norm = re.sub(r"\s+", " ", search_text).strip()
21
- found = []
22
- for i in range(doc.page_count):
23
- raw = doc[i].get_text("raw")
24
- norm = re.sub(r"\s+", " ", raw).strip()
25
- if st_norm in norm:
26
- found.append(i)
27
- doc.close()
28
- return sorted(found)
29
-
30
- class GeminiTopicExtractor:
31
- def __init__(self, api_key: str = None, num_pages: int = 20):
32
- self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
33
- self.num_pages = num_pages
34
-
35
- def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
36
- text_parts = []
37
- try:
38
- if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
39
- response = requests.get(pdf_path)
40
- if response.status_code != 200:
41
- logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
42
- return ""
43
- pdf_bytes = response.content
44
- else:
45
- with open(pdf_path, "rb") as f:
46
- pdf_bytes = f.read()
47
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
48
- pages_to_read = min(num_pages, doc.page_count)
49
- for i in range(pages_to_read):
50
- raw_text = doc[i].get_text("raw")
51
- text_parts.append(raw_text)
52
- doc.close()
53
- except Exception as e:
54
- logger.error(f"Could not open PDF: {e}")
55
- return "\n".join(text_parts)
56
-
57
- def extract_subtopics(self, pdf_path: str) -> dict:
58
- first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
59
- if not first_pages_text.strip():
60
- logger.error("No text from first pages => cannot extract subtopics.")
61
- return {}
62
- prompt = f"""
63
- You have the first pages of a PDF specification, including a table of contents.
64
- Instructions:
65
- 1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
66
- 2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
67
- 3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
68
- 4. Output only valid JSON of the form:
69
- {{
70
- "Subtopic A": [start_page, end_page],
71
- "Subtopic B": [start_page, end_page]
72
- }}
73
- 5. If you can't find any subtopics, return an empty JSON.
74
- Important notes:
75
- - The correct "end_page" must be the page number of the next topic or subtopic minus 1.
76
- - The final output must be valid JSON only, with no extra text or code blocks.
77
-
78
- Examples:
79
- 1. Given this table of contents:
80
- 1 Introduction – 2
81
- Why choose Edexcel A Level Mathematics? - 2
82
- Supporting you in planning and implementing this qualification - 3
83
- Qualification at a glance - 5
84
- 2 Subject content and assessment information – 7
85
- Paper 1 and Paper 2: Pure Mathematics - 11
86
- Paper 3: Statistics and Mechanics - 30
87
- Assessment Objectives - 40
88
- 3 Administration and general information – 42
89
- Entries - 42
90
- Access arrangements, reasonable adjustments, special consideration and malpractice - 42
91
- Student recruitment and progression - 45
92
-
93
- The correct output should be:
94
- {{
95
- "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
96
- "Paper 3: Statistics and Mechanics": [30, 38]
97
- }}
98
- 2. Given this table of contents:
99
- Qualification at a glance – 1
100
- Assessment Objectives and weightings - 4
101
- Knowledge, skills and understanding – 5
102
- Theme 1: Introduction to markets and market failure - 5
103
- Theme 2: The UK economy – performance and policies - 11
104
- Theme 3: Business behaviour and the labour market - 21
105
- Theme 4: A global perspective - 29
106
- Assessment – 39
107
- Assessment summary - 39
108
- Assessment objectives - 41
109
- Assessment overview - 42
110
- Breakdown of assessment objectives - 42
111
-
112
- The correct output should be:
113
- {{
114
- "Theme 1: Introduction to markets and market failure": [5, 10],
115
- "Theme 2: The UK economy – performance and policies": [11, 20],
116
- "Theme 3: Business behaviour and the labour market": [21, 28],
117
- "Theme 4: A global perspective": [29, 38]
118
- }}
119
-
120
- 3. You might also see sections like:
121
- 2.1 AS Unit 1 11
122
- 2.2 AS Unit 2 18
123
- 2.3 A2 Unit 3 24
124
- 2.4 A2 Unit 4 31
125
- In that scenario, your output might look like:
126
- {{
127
- "2.1 AS Unit 1": [11, 17],
128
- "2.2 AS Unit 2": [18, 23],
129
- "2.3 A2 Unit 3": [24, 30],
130
- "2.4 A2 Unit 4": [31, 35]
131
- }}
132
- or
133
- 2.1 AS units 6
134
- 2.2 AS units 23
135
- In that scenario, your output might look like:
136
- {{
137
- "2.1 AS Unit 1": [6, 2],
138
- "2.2 AS Unit 2": [23, 43]
139
- }}
140
-
141
- 4. Another example might list subtopics:
142
- 3.1 Overarching themes 11
143
- 3.2 A: Proof 12
144
- 3.3 B: Algebra and functions 13
145
- 3.4 C: Coordinate geometry in the ( x , y ) plane 14
146
- 3.5 D: Sequences and series 15
147
- 3.6 E: Trigonometry 16
148
- 3.7 F: Exponentials and logarithms 17
149
- 3.8 G: Differentiation 18
150
- 3.9 H: Integration 19
151
- 3.10 I: Numerical methods 20
152
- 3.11 J: Vectors 20
153
- 3.12 K: Statistical sampling 21
154
- 3.13 L: Data presentation and interpretation 21
155
- 3.14 M: Probability 22
156
- 3.15 N: Statistical distributions 23
157
- 3.16 O: Statistical hypothesis testing 23
158
- 3.17 P: Quantities and units in mechanics 24
159
- 3.18 Q: Kinematics 24
160
- 3.19 R: Forces and Newton’s laws 24
161
- 3.20 S: Moments 25
162
- 3.21 Use of data in statistics 26
163
-
164
- Here the correct output might look like:
165
- {{
166
- "A: Proof": [12, 12],
167
- "B: Algebra and functions": [13, 13],
168
- ...
169
- }}
170
- Now, extract topics from this text:
171
- {first_pages_text}
172
- """
173
- global _GEMINI_CLIENT
174
- if '_GEMINI_CLIENT' not in globals() or _GEMINI_CLIENT is None:
175
- _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
176
- client = _GEMINI_CLIENT
177
- try:
178
- response = client.models.generate_content(
179
- model="gemini-2.0-flash",
180
- contents=[prompt],
181
- config=types.GenerateContentConfig(temperature=0.0)
182
- )
183
- if not response or not response.text:
184
- logger.warning("No text from LLM => returning empty subtopics.")
185
- return {}
186
- raw_json = response.text.strip()
187
- cleaned = raw_json.replace("```json", "").replace("```", "")
188
- try:
189
- data = json.loads(cleaned)
190
- except Exception as json_err:
191
- logger.error(f"JSON parsing error: {json_err}")
192
- return {}
193
- final_dict = {}
194
- found_sub_dict = None
195
- for k, v in data.items():
196
- if isinstance(v, dict):
197
- found_sub_dict = v
198
- break
199
- if found_sub_dict is not None:
200
- for subk, rng in found_sub_dict.items():
201
- if isinstance(rng, list) and len(rng) == 2:
202
- final_dict[subk] = rng
203
- else:
204
- for subk, rng in data.items():
205
- if isinstance(rng, list) and len(rng) == 2:
206
- final_dict[subk] = rng
207
- return final_dict
208
- except Exception as e:
209
- logger.error(f"Gemini subtopic extraction error: {e}")
210
- return {}
211
-
212
- class TopicRangeExtractor:
213
- def __init__(self, gemini_api_key: str):
214
- self.gemini_api_key = gemini_api_key
215
- self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
216
-
217
- def process(self, pdf_path: str) -> dict:
218
- logger.info(f"Processing PDF: {pdf_path}")
219
- subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
220
- logger.info(f"Gemini returned subtopics: {subtopics}")
221
-
222
- if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
223
- response = requests.get(pdf_path)
224
- if response.status_code != 200:
225
- logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
226
- raise Exception(f"Failed to download PDF: {pdf_path}")
227
- pdf_bytes = response.content
228
- logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
229
- else:
230
- with open(pdf_path, "rb") as f:
231
- pdf_bytes = f.read()
232
- logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
233
-
234
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
235
- total_pages = doc.page_count
236
- doc.close()
237
-
238
- if not subtopics:
239
- return {"page_range": list(range(total_pages))}
240
-
241
- offset_candidates = []
242
- subtopics_corrected = {}
243
- for subname, rng in subtopics.items():
244
- if not (isinstance(rng, list) and len(rng) == 2):
245
- continue
246
- start_p, end_p = rng
247
- occs = find_all_occurrences(pdf_bytes, subname)
248
- for p in occs:
249
- candidate = p - (start_p - 1)
250
- if candidate > 0:
251
- offset_candidates.append(candidate)
252
-
253
- subtopics_corrected[subname] = rng
254
-
255
- if offset_candidates:
256
- try:
257
- global_offset = mode(offset_candidates)
258
- except Exception:
259
- global_offset = int(median(offset_candidates))
260
- else:
261
- global_offset = 0
262
- logger.info(f"Computed global offset: {global_offset}")
263
-
264
- adjusted_subtopics = []
265
- for subname, rng in subtopics_corrected.items():
266
- start_p, end_p = rng
267
- s0 = (start_p) + global_offset
268
- e0 = (end_p - 1) + global_offset
269
- adjusted_subtopics.append((subname, (s0, e0)))
270
-
271
- sorted_subtopics = sorted(adjusted_subtopics, key=lambda x: x[1][0])
272
- final_subtopics = []
273
- for i in range(len(sorted_subtopics)):
274
- subname, (s0, e0) = sorted_subtopics[i]
275
- if i < len(sorted_subtopics) - 1:
276
- next_s0 = sorted_subtopics[i + 1][1][0]
277
- new_e0 = min(e0, next_s0 - 1)
278
- else:
279
- new_e0 = min(e0, total_pages - 1)
280
- final_subtopics.append((subname, (s0, new_e0)))
281
-
282
- real_pages_set = set()
283
- for subname, (s0, e0) in final_subtopics:
284
- for pp in range(s0, e0 + 1):
285
- if 0 <= pp < total_pages:
286
- real_pages_set.add(pp)
287
-
288
- page_range = sorted(real_pages_set)
289
- logger.info(f"Final page range: {page_range}")
290
- return {"page_range": page_range}
291
-
292
- if __name__ == "__main__":
293
- input_pdf = "/home/user/app/input_output/pearson-A_Level_Economics.pdf"
294
- gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
295
- try:
296
- extractor = TopicRangeExtractor(gemini_api_key=gemini_key)
297
- result = extractor.process(input_pdf)
298
- # print(json.dumps(result, indent=2))
299
- except Exception as e:
300
- logger.error(f"Processing failed: {e}")
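The heart of the deleted page_range.py above is the global-offset step in TopicRangeExtractor.process: each subtopic's printed start page from the table of contents is compared with the physical page on which its heading actually occurs, and the most frequent difference becomes the document-wide offset. A minimal, illustrative sketch of that aggregation step (the candidate list is assumed to be precomputed):

# Sketch only: offset aggregation mirroring the deleted TopicRangeExtractor.process.
# The most common candidate wins; if statistics.mode raises (e.g. no unique mode on
# older Python versions), the median is used instead, and no candidates yields 0.
from statistics import mode, median
from typing import List

def aggregate_offset(candidates: List[int]) -> int:
    if not candidates:
        return 0
    try:
        return mode(candidates)
    except Exception:
        return int(median(candidates))
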
table_row_extraction.py DELETED
@@ -1,441 +0,0 @@
1
- import cv2
2
- import numpy as np
3
- import math
4
- import logging
5
- from pathlib import Path
6
- from typing import List, Tuple
7
-
8
- logging.basicConfig(level=logging.INFO)
9
- logger = logging.getLogger(__name__)
10
-
11
- # if you are working with 3-column tables, change `merge_two_col_rows` and `enable_subtopic_merge` to False
12
- # otherwise set them to True if you are working with 2-column tables (currently hardcoded, just test)
13
-
14
-
15
- def color_distance(c1: Tuple[float, float, float],
16
- c2: Tuple[float, float, float]) -> float:
17
- """
18
- Euclidean distance between two BGR colors c1 and c2.
19
- """
20
- return math.sqrt((c1[0] - c2[0])**2 + (c1[1] - c2[1])**2 + (c1[2] - c2[2])**2)
21
-
22
- def average_bgr(cell_img: np.ndarray) -> Tuple[float, float, float]:
23
- """
24
- Return the average BGR color of the entire cell_img.
25
- """
26
- b_mean = np.mean(cell_img[:, :, 0])
27
- g_mean = np.mean(cell_img[:, :, 1])
28
- r_mean = np.mean(cell_img[:, :, 2])
29
- return (b_mean, g_mean, r_mean)
30
-
31
- class TableExtractor:
32
- def __init__(
33
- self,
34
- # --- Preprocessing ---
35
- denoise_h: int = 10,
36
- clahe_clip: float = 3.0,
37
- clahe_grid: int = 8,
38
- sharpen_kernel: np.ndarray = np.array([[-1, -1, -1],
39
- [-1, 9, -1],
40
- [-1, -1, -1]]),
41
- thresh_block_size: int = 21,
42
- thresh_C: int = 7,
43
-
44
- # --- Row detection ---
45
- horizontal_scale: int = 20,
46
- row_morph_iterations: int = 1,
47
- min_row_height: int = 15,
48
- min_row_density: float = 0.01,
49
-
50
- # Additional row detection parameters
51
- faint_line_threshold_factor: float = 0.1,
52
- top_line_grouping_px: int = 8,
53
- some_minimum_text_pixels: int = 50,
54
-
55
- # --- Column detection ---
56
- vertical_scale: int = 20,
57
- col_morph_iterations: int = 2,
58
- min_col_height_ratio: float = 0.5,
59
- min_col_density: float = 0.01,
60
-
61
- # --- Bbox extraction ---
62
- padding: int = 0,
63
- skip_header: bool = True,
64
-
65
- # --- Two-column & subtopic merges ---
66
- merge_two_col_rows: bool = True,
67
- enable_subtopic_merge: bool = True,
68
- subtopic_threshold: float = 0.2,
69
-
70
- # --- Color-based artifact filter ---
71
- artifact_color_a6: Tuple[int, int, int] = (166, 166, 166),
72
- artifact_color_a7: Tuple[int, int, int] = (180, 180, 180),
73
- artifact_color_a8: Tuple[int, int, int] = (80, 48, 0),
74
- artifact_color_a9: Tuple[int, int, int] = (223, 153, 180),
75
- artifact_color_a10: Tuple[int, int, int] = (0, 0, 0),
76
- color_tolerance: float = 30.0
77
- ):
78
- # Preprocessing
79
- self.denoise_h = denoise_h
80
- self.clahe_clip = clahe_clip
81
- self.clahe_grid = clahe_grid
82
- self.sharpen_kernel = sharpen_kernel
83
- self.thresh_block_size = thresh_block_size
84
- self.thresh_C = thresh_C
85
-
86
- # Row detection
87
- self.horizontal_scale = horizontal_scale
88
- self.row_morph_iterations = row_morph_iterations
89
- self.min_row_height = min_row_height
90
- self.min_row_density = min_row_density
91
-
92
- # Additional row detection
93
- self.faint_line_threshold_factor = faint_line_threshold_factor
94
- self.top_line_grouping_px = top_line_grouping_px
95
- self.some_minimum_text_pixels = some_minimum_text_pixels
96
-
97
- # Column detection
98
- self.vertical_scale = vertical_scale
99
- self.col_morph_iterations = col_morph_iterations
100
- self.min_col_height_ratio = min_col_height_ratio
101
- self.min_col_density = min_col_density
102
-
103
- # Bbox extraction
104
- self.padding = padding
105
- self.skip_header = skip_header
106
-
107
- # Two-column & subtopic merges
108
- self.merge_two_col_rows = merge_two_col_rows
109
- self.enable_subtopic_merge = enable_subtopic_merge
110
- self.subtopic_threshold = subtopic_threshold
111
-
112
- # Color-based artifact filter
113
- self.artifact_color_a6 = artifact_color_a6
114
- self.artifact_color_a7 = artifact_color_a7
115
- self.artifact_color_a8 = artifact_color_a8
116
- self.artifact_color_a9 = artifact_color_a9
117
- self.artifact_color_a10 = artifact_color_a10
118
- self.color_tolerance = color_tolerance
119
-
120
- def preprocess(self, img: np.ndarray) -> np.ndarray:
121
- """
122
- Grayscale, denoise, CLAHE, sharpen, then adaptive threshold (binary_inv).
123
- """
124
- if img.ndim == 3:
125
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
126
- else:
127
- gray = img.copy()
128
-
129
- denoised = cv2.fastNlMeansDenoising(gray, h=self.denoise_h)
130
- clahe = cv2.createCLAHE(clipLimit=self.clahe_clip,
131
- tileGridSize=(self.clahe_grid, self.clahe_grid))
132
- enhanced = clahe.apply(denoised)
133
- sharpened = cv2.filter2D(enhanced, -1, self.sharpen_kernel)
134
-
135
- binarized = cv2.adaptiveThreshold(
136
- sharpened, 255,
137
- cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
138
- cv2.THRESH_BINARY_INV,
139
- self.thresh_block_size,
140
- self.thresh_C
141
- )
142
- return binarized
143
-
144
- def detect_full_rows(self, bin_img: np.ndarray) -> List[Tuple[int, int]]:
145
- h_kernel_size = max(1, bin_img.shape[1] // self.horizontal_scale)
146
- horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (h_kernel_size, 1))
147
- horizontal_lines = cv2.morphologyEx(
148
- bin_img, cv2.MORPH_OPEN, horizontal_kernel,
149
- iterations=self.row_morph_iterations
150
- )
151
-
152
- row_projection = np.sum(horizontal_lines, axis=1)
153
- max_val = np.max(row_projection) if len(row_projection) else 0
154
-
155
- if max_val < 1e-5:
156
- return [(0, bin_img.shape[0])]
157
-
158
- threshold_val = self.faint_line_threshold_factor * max_val
159
- line_indices = np.where(row_projection > threshold_val)[0]
160
- if len(line_indices) < 2:
161
- return [(0, bin_img.shape[0])]
162
-
163
- lines = []
164
- group = [line_indices[0]]
165
- for i in range(1, len(line_indices)):
166
- if (line_indices[i] - line_indices[i - 1]) <= self.top_line_grouping_px:
167
- group.append(line_indices[i])
168
- else:
169
- lines.append(int(np.mean(group)))
170
- group = [line_indices[i]]
171
- if group:
172
- lines.append(int(np.mean(group)))
173
-
174
- potential_bounds = []
175
- for i in range(len(lines) - 1):
176
- y1 = lines[i]
177
- y2 = lines[i + 1]
178
- if (y2 - y1) > 0:
179
- potential_bounds.append((y1, y2))
180
-
181
- if potential_bounds:
182
- if potential_bounds[0][0] > 0:
183
- potential_bounds.insert(0, (0, potential_bounds[0][0]))
184
- if potential_bounds[-1][1] < bin_img.shape[0]:
185
- potential_bounds.append((potential_bounds[-1][1], bin_img.shape[0]))
186
- else:
187
- potential_bounds = [(0, bin_img.shape[0])]
188
-
189
- final_rows = []
190
- for (y1, y2) in potential_bounds:
191
- height = (y2 - y1)
192
- region = bin_img[y1:y2, :]
193
- white_count = np.sum(region == 255)
194
-
195
- if height < self.min_row_height:
196
- if white_count >= self.some_minimum_text_pixels:
197
- final_rows.append((y1, y2))
198
- else:
199
- final_rows.append((y1, y2))
200
-
201
- final_rows = sorted(final_rows, key=lambda x: x[0])
202
- return final_rows if final_rows else [(0, bin_img.shape[0])]
203
-
204
- def detect_columns_in_row(self,
205
- row_img: np.ndarray,
206
- y1: int,
207
- y2: int) -> List[Tuple[int, int, int, int]]:
208
- row_height = (y2 - y1)
209
- row_width = row_img.shape[1]
210
-
211
- v_kernel_size = max(1, row_height // self.vertical_scale)
212
- vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, v_kernel_size))
213
-
214
- vertical_lines = cv2.morphologyEx(
215
- row_img, cv2.MORPH_OPEN, vertical_kernel,
216
- iterations=self.col_morph_iterations
217
- )
218
- vertical_lines = cv2.dilate(vertical_lines,
219
- np.ones((3, 3), np.uint8),
220
- iterations=1)
221
-
222
- # Find contours => x positions
223
- contours, _ = cv2.findContours(vertical_lines,
224
- cv2.RETR_EXTERNAL,
225
- cv2.CHAIN_APPROX_SIMPLE)
226
- x_positions = []
227
- for c in contours:
228
- x, _, w, h = cv2.boundingRect(c)
229
- # Must be at least half the row height to be a real divider
230
- if h >= self.min_col_height_ratio * row_height:
231
- x_positions.append(x)
232
-
233
- x_positions = sorted(set(x_positions))
234
- # Keep at most 2 vertical lines
235
- if len(x_positions) > 2:
236
- x_positions = x_positions[:2]
237
-
238
- # Build bounding boxes
239
- if len(x_positions) == 0:
240
- # 0 lines => single bounding box
241
- boxes = [(0, y1, row_width, row_height)]
242
-
243
- elif len(x_positions) == 1:
244
- # 1 line => 2 bounding boxes by default
245
- x1 = x_positions[0]
246
- if self.merge_two_col_rows:
247
- # Merge => single bounding box
248
- boxes = [(0, y1, row_width, row_height)]
249
- else:
250
- boxes = [
251
- (0, y1, x1, row_height),
252
- (x1, y1, row_width - x1, row_height)
253
- ]
254
- else:
255
- # 2 lines => normally 3 bounding boxes
256
- x1, x2 = sorted(x_positions)
257
- if self.enable_subtopic_merge:
258
- # If left bounding box is very narrow => treat as subtopic => 2 boxes
259
- if x1 < (self.subtopic_threshold * row_width):
260
- boxes = [
261
- (0, y1, x1, row_height),
262
- (x1, y1, row_width - x1, row_height)
263
- ]
264
- else:
265
- boxes = [
266
- (0, y1, x1, row_height),
267
- (x1, y1, x2 - x1, row_height),
268
- (x2, y1, row_width - x2, row_height)
269
- ]
270
- else:
271
- boxes = [
272
- (0, y1, x1, row_height),
273
- (x1, y1, x2 - x1, row_height),
274
- (x2, y1, row_width - x2, row_height)
275
- ]
276
-
277
- # Filter out columns with insufficient density
278
- filtered = []
279
- for (x, y, w, h) in boxes:
280
- if w <= 0:
281
- continue
282
- subregion = row_img[:, x:x+w]
283
- white_pixels = np.sum(subregion == 255)
284
- total_pixels = subregion.size
285
- if total_pixels == 0:
286
- continue
287
- density = white_pixels / float(total_pixels)
288
- if density >= self.min_col_density:
289
- filtered.append((x, y, w, h))
290
-
291
- return filtered
292
-
293
- def process_image(self, image_path: str) -> List[List[Tuple[int, int, int, int]]]:
294
- """
295
- 1) Preprocess => bin_img
296
- 2) Detect row segments (with faint-line logic)
297
- 3) Filter out rows by density
298
- 4) Optionally skip the first row (header)
299
- 5) For each row => detect columns => bounding boxes
300
- """
301
- img = cv2.imread(image_path)
302
- if img is None:
303
- raise ValueError(f"Could not read image: {image_path}")
304
-
305
- bin_img = self.preprocess(img)
306
- row_segments = self.detect_full_rows(bin_img)
307
-
308
- # Filter out rows with insufficient density
309
- valid_rows = []
310
- for (y1, y2) in row_segments:
311
- row_region = bin_img[y1:y2, :]
312
- area = row_region.size
313
- if area == 0:
314
- continue
315
- white_pixels = np.sum(row_region == 255)
316
- density = white_pixels / float(area)
317
- if density >= self.min_row_density:
318
- valid_rows.append((y1, y2))
319
-
320
- # skip header row
321
- if self.skip_header and len(valid_rows) > 1:
322
- valid_rows = valid_rows[1:]
323
-
324
- # Detect columns in each valid row
325
- all_rows_boxes = []
326
- for (y1, y2) in valid_rows:
327
- row_img = bin_img[y1:y2, :]
328
- col_boxes = self.detect_columns_in_row(row_img, y1, y2)
329
- if col_boxes:
330
- all_rows_boxes.append(col_boxes)
331
-
332
- return all_rows_boxes
333
-
334
- def extract_box_image(self,
335
- original: np.ndarray,
336
- box: Tuple[int, int, int, int]) -> np.ndarray:
337
- """
338
- Crop bounding box from original with optional padding.
339
- """
340
- x, y, w, h = box
341
- Y1 = max(0, y - self.padding)
342
- Y2 = min(original.shape[0], y + h + self.padding)
343
- X1 = max(0, x - self.padding)
344
- X2 = min(original.shape[1], x + w + self.padding)
345
- return original[Y1:Y2, X1:X2]
346
-
347
- def is_artifact_by_color(self, cell_img: np.ndarray) -> bool:
348
- """
349
- Revert to the *exact* color-based artifact logic from the first script:
350
- 1) If the average color is near #a6a6a6 or #a7a7a7 (within color_tolerance),
351
- skip it. Otherwise, keep it.
352
- """
353
- if cell_img.size == 0:
354
- return True
355
-
356
- avg_col = average_bgr(cell_img)
357
- dist_a6 = color_distance(avg_col, self.artifact_color_a6)
358
- if dist_a6 < self.color_tolerance:
359
- return True
360
-
361
- dist_a7 = color_distance(avg_col, self.artifact_color_a7)
362
- if dist_a7 < self.color_tolerance:
363
- return True
364
-
365
- dist_a8 = color_distance(avg_col, self.artifact_color_a8)
366
- if dist_a8 < self.color_tolerance:
367
- return True
368
-
369
- dist_a9 = color_distance(avg_col, self.artifact_color_a9)
370
- if dist_a9 < self.color_tolerance:
371
- return True
372
-
373
- dist_a10 = color_distance(avg_col, self.artifact_color_a10)
374
- if dist_a10 < self.color_tolerance:
375
- return True
376
-
377
- return False
378
-
379
- def save_extracted_cells(
380
- self,
381
- image_path: str,
382
- row_boxes: List[List[Tuple[int, int, int, int]]],
383
- output_dir: str
384
- ):
385
- """
386
- Save each cell from the original image, skipping if it's near #a6a6a6 or #a7a7a7.
387
- """
388
- out_path = Path(output_dir)
389
- out_path.mkdir(exist_ok=True, parents=True)
390
-
391
- original = cv2.imread(image_path)
392
- if original is None:
393
- raise ValueError(f"Could not read original image: {image_path}")
394
-
395
- for i, row in enumerate(row_boxes):
396
- row_dir = out_path / f"row_{i}"
397
- row_dir.mkdir(exist_ok=True)
398
- for j, box in enumerate(row):
399
- cell_img = self.extract_box_image(original, box)
400
-
401
- # Check color-based artifact
402
- if self.is_artifact_by_color(cell_img):
403
- logger.info(f"Skipping artifact cell at row={i}, col={j} (color near #a6a6a6/#a7a7a7).")
404
- continue
405
-
406
- out_file = row_dir / f"col_{j}.png"
407
- cv2.imwrite(str(out_file), cell_img)
408
- logger.info(f"Saved cell row={i}, col={j} -> {out_file}")
409
-
410
- class TableExtractorApp:
411
- def __init__(self, extractor: TableExtractor):
412
- self.extractor = extractor
413
-
414
- def run(self, input_image: str, output_folder: str):
415
- row_boxes = self.extractor.process_image(input_image)
416
- logger.info(f"Detected {len(row_boxes)} row(s).")
417
- self.extractor.save_extracted_cells(input_image, row_boxes, output_folder)
418
- logger.info("Done. Check the output folder for results.")
419
-
420
- if __name__ == "__main__":
421
- input_image = "images/test/img_9.png"
422
- output_folder = "combined_outputs"
423
-
424
- extractor = TableExtractor(
425
- row_morph_iterations=1,
426
- min_row_height=15,
427
- skip_header=False,
428
-
429
- merge_two_col_rows=True,
430
- enable_subtopic_merge=True,
431
- subtopic_threshold=0.2,
432
-
433
- faint_line_threshold_factor=0.4,
434
- top_line_grouping_px=12,
435
- some_minimum_text_pixels=50,
436
-
437
- color_tolerance=30.0
438
- )
439
-
440
- app = TableExtractorApp(extractor)
441
- app.run(input_image, output_folder)
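The comment near the top of the deleted table_row_extraction.py says to set merge_two_col_rows and enable_subtopic_merge to False when working with 3-column tables. An illustrative sketch of that configuration, using only constructor arguments shown above (the image path and output folder are placeholders taken from the module's own __main__ block):

# Sketch only: driving the (now removed) TableExtractor on a 3-column table,
# with both merge behaviours disabled so two detected dividers yield three cells per row.
from table_row_extraction import TableExtractor, TableExtractorApp

extractor_3col = TableExtractor(
    merge_two_col_rows=False,
    enable_subtopic_merge=False,
)
TableExtractorApp(extractor_3col).run("images/test/img_9.png", "combined_outputs")
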
topic_extr.py CHANGED
@@ -1,6 +1,5 @@
1
  #!/usr/bin/env python3
2
  import os
3
- import sys
4
  import json
5
  import logging
6
  import gc
@@ -8,58 +7,22 @@ import fitz
8
  import requests
9
  import torch
10
  import boto3
11
- import re
 
12
 
13
  from magic_pdf.data.dataset import PymuDocDataset
14
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
15
 
16
- logging.basicConfig(level=logging.INFO)
17
  logger = logging.getLogger(__name__)
18
 
19
- def create_subset_pdf(original_pdf_bytes: bytes, page_indices: list) -> bytes:
20
- if not page_indices:
21
- raise ValueError("No page indices provided for subset creation.")
22
- doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
23
- new_doc = fitz.open()
24
- for p in sorted(set(page_indices)):
25
- if 0 <= p < doc.page_count:
26
- new_doc.insert_pdf(doc, from_page=p, to_page=p)
27
- else:
28
- logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
29
- raise ValueError(f"Page index {p} out of range.")
30
- subset_bytes = new_doc.tobytes()
31
- new_doc.close()
32
- doc.close()
33
- return subset_bytes
34
-
35
- def parse_page_range(page_field) -> list:
36
- """
37
- Parse the 'page' field from the JSON input.
38
- It can be either:
39
- • a list of integers:
40
- - If the list contains exactly two integers, treat them as a range [start, end] (inclusive start, exclusive end).
41
- - Otherwise, treat the list as a sequence of individual pages.
42
- • a string:
43
- - Either a comma-separated range "start, end" or a comma-separated list of pages.
44
- The numbers are assumed to be 1-indexed and are converted to 0-indexed.
45
- """
46
- if isinstance(page_field, list):
47
- if len(page_field) == 2:
48
- start, end = page_field
49
- return list(range(start - 1, end))
50
- else:
51
- return [int(p) - 1 for p in page_field]
52
- elif isinstance(page_field, str):
53
- parts = [p.strip() for p in page_field.split(',')]
54
- if len(parts) == 2:
55
- start, end = int(parts[0]), int(parts[1])
56
- return list(range(start - 1, end))
57
- else:
58
- return [int(p) - 1 for p in parts]
59
- else:
60
- logger.error("Invalid type for page field. Must be list or string.")
61
- raise ValueError("Invalid page field type.")
62
-
63
  class s3Writer:
64
  def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
65
  self.bucket = bucket
@@ -72,7 +35,6 @@ class s3Writer:
72
 
73
  def write(self, path: str, data: bytes) -> None:
74
  try:
75
- from io import BytesIO
76
  file_obj = BytesIO(data)
77
  self.client.upload_fileobj(file_obj, self.bucket, path)
78
  logger.info(f"Uploaded to S3: {path}")
@@ -101,21 +63,42 @@ class S3ImageWriter:
101
  md_content = md_content.replace(f"![]({key}{path})", f"![]({s3_path})")
102
  return md_content
103
 
104
  class TopicExtractionProcessor:
105
- def __init__(self, gemini_api_key: str, s3_config: dict, output_folder: str):
106
- self.gemini_api_key = gemini_api_key
107
- self.output_folder = output_folder
108
- os.makedirs(self.output_folder, exist_ok=True)
109
- self.layout_model = "doclayout_yolo"
110
- self.formula_enable = True
111
- self.table_enable = False
112
- self.language = "en"
113
- self.s3_writer = s3Writer(
114
- ak=os.getenv("S3_ACCESS_KEY"),
115
- sk=os.getenv("S3_SECRET_KEY"),
116
- bucket="quextro-resources",
117
- endpoint_url=os.getenv("S3_ENDPOINT")
118
- )
119
 
120
  def cleanup_gpu(self):
121
  try:
@@ -123,105 +106,100 @@ class TopicExtractionProcessor:
123
  torch.cuda.empty_cache()
124
  logger.info("GPU memory cleaned up.")
125
  except Exception as e:
126
- logger.error(f"Error during GPU cleanup: {e}")
127
-
128
- def process_input_file(self, input_file: dict) -> str:
129
- key = input_file.get("key", "")
130
- url = input_file.get("url", "")
131
- page_field = input_file.get("page")
132
- if not url or not page_field:
133
- logger.error("Input file must contain 'url' and 'page' fields.")
134
- raise ValueError("Missing 'url' or 'page' in input file.")
135
-
136
- page_indices = parse_page_range(page_field)
137
- logger.info("Using page indices (0-indexed): %s", page_indices)
138
-
139
- # Retrieve PDF bytes (supports URL or local file)
140
- if url.startswith("http://") or url.startswith("https://"):
141
- response = requests.get(url)
142
- if response.status_code != 200:
143
- logger.error("Failed to download PDF from %s. Status code: %d", url, response.status_code)
144
- raise Exception(f"Failed to download PDF: {url}")
145
- pdf_bytes = response.content
146
- else:
147
- with open(url, "rb") as f:
148
- pdf_bytes = f.read()
149
-
150
- subset_pdf_bytes = create_subset_pdf(pdf_bytes, page_indices)
151
- logger.info("Created subset PDF with %d pages", len(page_indices))
152
-
153
- dataset = PymuDocDataset(subset_pdf_bytes)
154
- inference = doc_analyze(
155
- dataset,
156
- ocr=True,
157
- lang=self.language,
158
- layout_model=self.layout_model,
159
- formula_enable=self.formula_enable,
160
- table_enable=self.table_enable
161
- )
162
-
163
- base_path = f"/topic-extraction/{key}/"
164
- writer = S3ImageWriter(self.s3_writer, "/topic-extraction/", self.gemini_api_key)
165
- md_prefix = "/topic-extraction/"
166
- pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
167
- md_content = pipe_result.get_markdown(md_prefix)
168
- final_markdown = writer.post_process(md_prefix, md_content)
169
-
170
- output_md_path = os.path.join(self.output_folder, f"{key}_output.md")
171
- with open(output_md_path, "w", encoding="utf-8") as f:
172
- f.write(final_markdown)
173
- logger.info("Markdown output saved to %s", output_md_path)
174
 
175
- self.cleanup_gpu()
176
- return final_markdown
177
 
178
  def main():
179
- message = {
180
- "pattern": "topic_extraction",
181
- "data": {
182
- "input_files": [
183
- {
184
- "key": "sample_spec",
185
- "url": "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf",
186
- "type": "specification",
187
- "page": [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42]
188
- }
189
- ],
190
- "topics": [
191
- {
192
- "title": "Sample Topic",
193
- "id": 123
194
- }
195
- ]
196
- }
197
  }
198
- data = message.get("data", {})
199
- input_files = data.get("input_files", [])
200
-
201
- output_folder = "output"
202
-
203
- gemini_api_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
204
 
205
- s3_config = {
206
- "ak": os.getenv("S3_ACCESS_KEY"),
207
- "sk": os.getenv("S3_SECRET_KEY"),
208
- "bucket": "quextro-resources",
209
- "endpoint_url": os.getenv("S3_ENDPOINT")
210
- }
211
 
212
- processor = TopicExtractionProcessor(
213
- gemini_api_key=gemini_api_key,
214
- s3_config=s3_config,
215
- output_folder=output_folder
216
- )
217
-
218
- for input_file in message["data"].get("input_files", []):
219
- try:
220
- logger.info("Processing input file with key: %s", input_file.get("key", ""))
221
- final_md = processor.process_input_file(input_file)
222
- logger.info("Processing completed for key: %s", input_file.get("key", ""))
223
- except Exception as e:
224
- logger.error("Error processing input file: %s", e)
225
 
226
  if __name__ == "__main__":
227
- main()
 
1
  #!/usr/bin/env python3
2
  import os
 
3
  import json
4
  import logging
5
  import gc
 
7
  import requests
8
  import torch
9
  import boto3
10
+ from io import BytesIO
11
+ from typing import Dict, List, Any
12
 
13
  from magic_pdf.data.dataset import PymuDocDataset
14
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
15
 
16
+ logging.basicConfig(
17
+ level=logging.INFO,
18
+ format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
19
+ handlers=[
20
+ logging.StreamHandler(),
21
+ logging.FileHandler('topic_processor.log')
22
+ ]
23
+ )
24
  logger = logging.getLogger(__name__)
25
 
26
  class s3Writer:
27
  def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
28
  self.bucket = bucket
 
35
 
36
  def write(self, path: str, data: bytes) -> None:
37
  try:
 
38
  file_obj = BytesIO(data)
39
  self.client.upload_fileobj(file_obj, self.bucket, path)
40
  logger.info(f"Uploaded to S3: {path}")
 
63
  md_content = md_content.replace(f"![]({key}{path})", f"![]({s3_path})")
64
  return md_content
65
 
66
+ def delete_non_heading_text(md_content: str) -> str:
67
+ filtered_lines = []
68
+ for line in md_content.splitlines():
69
+ stripped = line.lstrip()
70
+ if stripped.startswith('#') or stripped.startswith('![]('):
71
+ filtered_lines.append(line)
72
+ return "\n".join(filtered_lines)
73
+
74
  class TopicExtractionProcessor:
75
+ def __init__(self, gemini_api_key: str = None):
76
+ try:
77
+ self.s3_writer = s3Writer(
78
+ ak=os.getenv("S3_ACCESS_KEY"),
79
+ sk=os.getenv("S3_SECRET_KEY"),
80
+ bucket="quextro-resources",
81
+ endpoint_url=os.getenv("S3_ENDPOINT")
82
+ )
83
+
84
+ config_path = "/home/user/magic-pdf.json"
85
+ if os.path.exists(config_path):
86
+ with open(config_path, "r") as f:
87
+ config = json.load(f)
88
+ self.layout_model = config.get("layout-config", {}).get("model", "doclayout_yolo")
89
+ self.formula_enable = config.get("formula-config", {}).get("enable", True)
90
+ else:
91
+ self.layout_model = "doclayout_yolo"
92
+ self.formula_enable = True
93
+
94
+ self.table_enable = False
95
+ self.language = "en"
96
+ self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
97
+
98
+ logger.info("TopicExtractionProcessor initialized successfully")
99
+ except Exception as e:
100
+ logger.error("Failed to initialize TopicExtractionProcessor: %s", str(e))
101
+ raise
102
 
103
  def cleanup_gpu(self):
104
  try:
 
106
  torch.cuda.empty_cache()
107
  logger.info("GPU memory cleaned up.")
108
  except Exception as e:
109
+ logger.error("Error during GPU cleanup: %s", e)
110
+
111
+ def process(self, input_file: Dict[str, Any]) -> str:
112
+ try:
113
+ key = input_file.get("key", "")
114
+ url = input_file.get("url", "")
115
+ page_field = input_file.get("page")
116
+
117
+ if not url or not page_field:
118
+ raise ValueError("Missing required 'url' or 'page' in input file")
119
+
120
+ page_indices = self.parse_page_range(page_field)
121
+ logger.info("Processing %s with pages %s", key, page_indices)
122
+
123
+ if url.startswith(("http://", "https://")):
124
+ response = requests.get(url)
125
+ response.raise_for_status()
126
+ pdf_bytes = response.content
127
+ else:
128
+ with open(url, "rb") as f:
129
+ pdf_bytes = f.read()
130
+
131
+ subset_pdf = self.create_subset_pdf(pdf_bytes, page_indices)
132
+
133
+ dataset = PymuDocDataset(subset_pdf)
134
+ inference = doc_analyze(
135
+ dataset,
136
+ ocr=True,
137
+ lang=self.language,
138
+ layout_model=self.layout_model,
139
+ formula_enable=self.formula_enable,
140
+ table_enable=self.table_enable
141
+ )
142
+
143
+ base_path = f"/topic-extraction/{key}/"
144
+ writer = S3ImageWriter(self.s3_writer, "/topic-extraction/", self.gemini_api_key)
145
+ md_prefix = "/topic-extraction/"
146
+ pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
147
+ md_content = pipe_result.get_markdown(md_prefix)
148
+ post_processed = writer.post_process(md_prefix, md_content)
149
+
150
+ #remove non-heading text from the markdown output
151
+ final_markdown = delete_non_heading_text(post_processed)
152
+
153
+ return final_markdown
154
+
155
+ except Exception as e:
156
+ logger.error("Processing failed for %s: %s", key, str(e))
157
+ raise
158
+ finally:
159
+ self.cleanup_gpu()
160
+
161
+ def create_subset_pdf(self, pdf_bytes: bytes, page_indices: List[int]) -> bytes:
162
+ """Create a PDF subset from specified pages"""
163
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
164
+ new_doc = fitz.open()
165
 
166
+ try:
167
+ for p in sorted(set(page_indices)):
168
+ if 0 <= p < doc.page_count:
169
+ new_doc.insert_pdf(doc, from_page=p, to_page=p)
170
+ else:
171
+ raise ValueError(f"Page index {p} out of range (0-{doc.page_count-1})")
172
+ return new_doc.tobytes()
173
+ finally:
174
+ new_doc.close()
175
+ doc.close()
176
+
177
+ def parse_page_range(self, page_field) -> List[int]:
178
+ """Parse page range from input (1-indexed to 0-indexed)"""
179
+ if isinstance(page_field, list):
180
+ return [int(p) - 1 for p in page_field]
181
+ if isinstance(page_field, str):
182
+ parts = [p.strip() for p in page_field.split(',')]
183
+ return [int(p) - 1 for p in parts]
184
+ raise ValueError("Invalid page field type")
185
 
186
  def main():
187
+ """Local test execution without RabbitMQ"""
188
+ test_input = {
189
+ "key": "local_test",
190
+ "url": "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf", # Local PDF path
191
+ "page":[15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42]
192
  }
193
 
194
+ processor = TopicExtractionProcessor()
195
 
196
+ try:
197
+ logger.info("Starting test processing.")
198
+ result = processor.process(test_input)
199
+ logger.info("Processing completed successfully")
200
+ print("Markdown:\n", result)
201
+ except Exception as e:
202
+ logger.error("Test failed: %s", str(e))
203
 
204
  if __name__ == "__main__":
205
+ main()
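One behavioural change visible in the topic_extr.py diff above: the old module-level parse_page_range expanded a two-element list or comma-separated string into a start/end range, while the new TopicExtractionProcessor.parse_page_range treats every element as an individual 1-indexed page. An illustrative standalone sketch of the new semantics:

# Sketch only: the new parse_page_range behaviour -- every entry is one
# 1-indexed page number, converted to a 0-indexed page index.
from typing import List

def parse_page_range(page_field) -> List[int]:
    if isinstance(page_field, list):
        return [int(p) - 1 for p in page_field]
    if isinstance(page_field, str):
        return [int(p.strip()) - 1 for p in page_field.split(",")]
    raise ValueError("Invalid page field type")

assert parse_page_range([15, 16, 17]) == [14, 15, 16]
assert parse_page_range("15, 42") == [14, 41]  # no longer expanded to every page from 15 to 42
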
topic_extraction.py DELETED
@@ -1,988 +0,0 @@
1
- #!/usr/bin/env python3
2
- import os
3
- import re
4
- import gc
5
- import json
6
- import logging
7
- import fitz
8
- import boto3
9
- import base64
10
- import time
11
- import asyncio
12
- import tempfile
13
- import requests
14
- from io import BytesIO
15
- from typing import List, Dict, Any
16
-
17
- import torch
18
- import cv2
19
- import numpy as np
20
-
21
- from google import genai
22
- from google.genai import types
23
-
24
- from magic_pdf.data.dataset import PymuDocDataset
25
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
26
- from magic_pdf.data.data_reader_writer.base import DataWriter
27
- from table_row_extraction import TableExtractor
28
-
29
- logging.basicConfig(level=logging.INFO)
30
- logger = logging.getLogger(__name__)
31
- logger.setLevel(logging.INFO)
32
- file_handler = logging.FileHandler("topic_extraction.log")
33
- file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s"))
34
- logger.addHandler(file_handler)
35
-
36
- _GEMINI_CLIENT = None
37
-
38
- #helper functions, also global
39
- def unify_whitespace(text: str) -> str:
40
- return re.sub(r"\s+", " ", text).strip()
41
-
42
- def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
43
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
44
- st_norm = unify_whitespace(search_text)
45
- found = []
46
- for i in range(doc.page_count):
47
- raw = doc[i].get_text("raw")
48
- norm = unify_whitespace(raw)
49
- if st_norm in norm:
50
- found.append(i)
51
- doc.close()
52
- return sorted(found)
53
-
54
- def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
55
- if not page_indices:
56
- raise ValueError("No page indices provided for subset creation.")
57
- doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
58
- new_doc = fitz.open()
59
- for p in sorted(set(page_indices)):
60
- if 0 <= p < doc.page_count:
61
- new_doc.insert_pdf(doc, from_page=p, to_page=p)
62
- else:
63
- logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
64
- raise ValueError(f"Page index {p} out of range.")
65
- subset_bytes = new_doc.tobytes()
66
- new_doc.close()
67
- doc.close()
68
- return subset_bytes
69
-
70
- def unify_topic_name(raw_title: str, children_subtopics: list) -> str:
71
- """
72
- Clean up a topic title:
73
- - Remove any trailing "continued".
74
- - If the title does not start with a number but children provide a consistent numeric prefix,
75
- then prepend that prefix.
76
- """
77
- title = raw_title.strip()
78
- # Remove trailing "continued"
79
- title = re.sub(r"\s+continued\s*$", "", title, flags=re.IGNORECASE)
80
-
81
- # If title already starts with a number, use it as is.
82
- if re.match(r"^\d+", title):
83
- return title
84
-
85
- # Otherwise, try to deduce a numeric prefix from the children.
86
- prefixes = []
87
- for child in children_subtopics:
88
- child_title = child.get("title", "").strip()
89
- m = re.match(r"^(\d+)\.", child_title)
90
- if m:
91
- prefixes.append(m.group(1))
92
- if prefixes:
93
- # If all numeric prefixes in children are the same, use that prefix.
94
- if all(p == prefixes[0] for p in prefixes):
95
- # If title is non-empty, prepend the number; otherwise, use a fallback.
96
- if title:
97
- title = f"{prefixes[0]} {title}"
98
- else:
99
- title = f"{prefixes[0]} Topic"
100
- # Optionally, handle known broken titles explicitly.
101
- if title.lower() in {"gonometry"}:
102
- # For example, if children indicate "5.X", set to "5 Trigonometry"
103
- if prefixes and prefixes[0] == "5":
104
- title = "5 Trigonometry"
105
- return title
106
-
107
-
108
- def merge_topics(subtopic_list: list) -> list:
109
- """
110
- Merge topics with an enhanced logic:
111
- 1. Clean up each topic's title using unify_topic_name.
112
- 2. Group topics by the parent's numeric prefix (if available). Topics without a numeric prefix use their title.
113
- 3. Reassign children: for each child whose title (e.g. "3.1") does not match its current parent's numeric prefix,
114
- move it to the parent with the matching prefix if available.
115
- 4. Remove duplicate children by merging contents.
116
- 5. Sort parent topics and each parent's children by their numeric ordering.
117
- """
118
- # First, merge topics by parent's numeric prefix.
119
- merged = {}
120
- for topic_obj in subtopic_list:
121
- raw_title = topic_obj.get("title", "")
122
- children = topic_obj.get("children", [])
123
- contents = topic_obj.get("contents", [])
124
- new_title = unify_topic_name(raw_title, children)
125
- # Extract parent's numeric prefix, if present.
126
- m = re.match(r"^(\d+)", new_title)
127
- parent_prefix = m.group(1) if m else None
128
- key = parent_prefix if parent_prefix is not None else new_title
129
-
130
- if key not in merged:
131
- merged[key] = {
132
- "title": new_title,
133
- "contents": list(contents),
134
- "children": list(children),
135
- }
136
- else:
137
- # Merge contents and children; choose the longer title.
138
- if len(new_title) > len(merged[key]["title"]):
139
- merged[key]["title"] = new_title
140
- merged[key]["contents"].extend(contents)
141
- merged[key]["children"].extend(children)
142
-
143
- # Build a lookup of merged topics by their numeric prefix.
144
- parent_lookup = merged # keys are numeric prefixes or the full title for non-numeric ones.
145
-
146
- # Reassign children to the correct parent based on their numeric prefix.
147
- for key, topic in merged.items():
148
- new_children = []
149
- for child in topic["children"]:
150
- child_title = child.get("title", "").strip()
151
- m_child = re.match(r"^(\d+)\.", child_title)
152
- if m_child:
153
- child_prefix = m_child.group(1)
154
- if key != child_prefix and child_prefix in parent_lookup:
155
- # Reassign this child to the proper parent.
156
- parent_lookup[child_prefix]["children"].append(child)
157
- continue
158
- new_children.append(child)
159
- topic["children"] = new_children
160
-
161
- # Remove duplicate children by merging their contents.
162
- for topic in merged.values():
163
- child_map = {}
164
- for child in topic["children"]:
165
- ctitle = child.get("title", "").strip()
166
- if ctitle not in child_map:
167
- child_map[ctitle] = child
168
- else:
169
- child_map[ctitle]["contents"].extend(child.get("contents", []))
170
- child_map[ctitle]["children"].extend(child.get("children", []))
171
- topic["children"] = list(child_map.values())
172
-
173
- # Sort children by full numeric order (e.g. "2.1" < "2.10" < "2.2").
174
- def parse_subtopic_num(subtitle):
175
- digits = re.findall(r"\d+", subtitle)
176
- return tuple(int(d) for d in digits) if digits else (9999,)
177
- topic["children"].sort(key=lambda ch: parse_subtopic_num(ch.get("title", "")))
178
-
179
- # Convert merged topics to a sorted list.
180
- def parse_parent_num(topic):
181
- m = re.match(r"^(\d+)", topic.get("title", ""))
182
- return int(m.group(1)) if m else 9999
183
- final_list = list(merged.values())
184
- final_list.sort(key=lambda topic: parse_parent_num(topic))
185
- return final_list
186
-
187
- class s3Writer:
188
- def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
189
- self.bucket = bucket
190
- self.client = boto3.client(
191
- 's3',
192
- aws_access_key_id=ak,
193
- aws_secret_access_key=sk,
194
- endpoint_url=endpoint_url
195
- )
196
-
197
- def write(self, path: str, data: bytes) -> None:
198
- try:
199
- file_obj = BytesIO(data)
200
- self.client.upload_fileobj(
201
- file_obj,
202
- self.bucket,
203
- path
204
- )
205
- logger.info(f"Uploaded to S3: {path}")
206
- except Exception as e:
207
- logger.error(f"Failed to upload to S3: {str(e)}")
208
- raise
209
-
210
- def delete(self, path: str) -> None:
211
- try:
212
- self.client.delete_object(Bucket=self.bucket, Key=path)
213
- except Exception as e:
214
- logger.error(f"Failed to delete from S3: {str(e)}")
215
- raise
216
-
217
- def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
218
- arr = np.frombuffer(image_data, np.uint8)
219
- img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
220
- if img is not None:
221
- h, w, _ = img.shape
222
- if max(h, w) > max_dim:
223
- scale = max_dim / float(max(h, w))
224
- new_w = int(w * scale)
225
- new_h = int(h * scale)
226
- img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
227
- encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
228
- success, enc = cv2.imencode(".jpg", img, encode_params)
229
- if success:
230
- return enc.tobytes()
231
- return image_data
232
-
233
- def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
234
- """
235
- Existing Gemini call to classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE.
236
- """
237
- for attempt in range(max_retries + 1):
238
- try:
239
- prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
240
- The three-column 'table' image includes such key features:
241
- - Three columns header
242
- - Headers like 'Topics', 'Content', 'Guidelines', 'Amplification', 'Additional guidance notes', 'Area of Study'
243
- - Possibly sections (e.g. 8.4, 9.1)
244
- The two-column 'table' image includes such key features:
245
- - Two columns
246
- - Headers like 'Subject content', 'Additional information'
247
- - Possibly sections (e.g. 2.1, 3.4, G2, G3, )
248
- If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
249
- If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
250
- If the image is non-empty but does not show a table, respond with 'NO_TABLE'.
251
- Return only one of these exact labels.
252
- """
253
- global _GEMINI_CLIENT
254
- if _GEMINI_CLIENT is None:
255
- _GEMINI_CLIENT = genai.Client(api_key=api_key)
256
- client = _GEMINI_CLIENT
257
-
258
- resp = client.models.generate_content(
259
- model="gemini-2.0-flash",
260
- contents=[
261
- {
262
- "parts": [
263
- {"text": prompt},
264
- {
265
- "inline_data": {
266
- "mime_type": "image/jpeg",
267
- "data": base64.b64encode(image_data).decode('utf-8')
268
- }
269
- }
270
- ]
271
- }
272
- ],
273
- config=types.GenerateContentConfig(temperature=0.0)
274
- )
275
- if resp and resp.text:
276
- classification = resp.text.strip().upper()
277
- if "THREE" in classification:
278
- return "THREE_COLUMN"
279
- elif "TWO" in classification:
280
- return "TWO_COLUMN"
281
- elif "EMPTY" in classification:
282
- return "EMPTY_IMAGE"
283
- return "NO_TABLE"
284
- except Exception as e:
285
- logger.error(f"Gemini table classification error: {e}")
286
- if "503" in str(e):
287
- return "NO_TABLE"
288
- if attempt < max_retries:
289
- time.sleep(0.5)
290
- else:
291
- return "NO_TABLE"
292
-
293
- async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
294
- loop = asyncio.get_event_loop()
295
- preprocessed = preprocess_image(image_data)
296
- return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
297
-
298
- def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
299
- for attempt in range(max_retries + 1):
300
- try:
301
- prompt = """
302
- You are given an image from an educational curriculum specification for Gemini Flash 2. The image may contain:
303
- 1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
304
- 2) A subtopic heading in the format "<number>.<number>" or "<number>.<number>.<number>", for example "2.5", "2.6", "3.4", "2.1.1", "4.3.3" or "1.2.1".
305
- 3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
306
- 4) Possibly no relevant text or only truncated text (e.g. "Topics", "Subject content", "What students need to learn", "Content Amplification Additional guidance notes", etc.).
307
-
308
- Your task is to extract:
309
- - **"title"**: A recognized main topic or heading text.
310
- - **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4", "G2", "2.1.1", "4.1.1"), as an array of strings.
311
-
312
- Follow these rules:
313
-
314
- (1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued":
315
- - Remove the word "continued" if present.
316
- - Put that resulting text in "title". (e.g. "2 Algebra and functions")
317
- - "subtopics" should be an empty array, unless smaller subtopic numbers (e.g. "2.5") are also detected in the same text.
318
-
319
- (2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4":
320
- - Collect those exact strings in the JSON key "subtopics" (an array of strings).
321
- - "title" in this case should be an empty string if you only detect subtopics.
322
- (Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).
323
-
324
- (3) **If no main topic or subtopic is detected but the text appears to be a heading**, for example "Specialisation, division of labour and exchange", then:
325
- - Return:
326
- {
327
- "title": "<the heading text>",
328
- "subtopics": []
329
- }
330
-
331
- (4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
332
- - Use that left column text as "title".
333
- - "subtopics" remains empty.
334
- Example:
335
- If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
336
- {
337
- "title": "Scarcity, choice and opportunity cost",
338
- "subtopics": []
339
- }
340
-
341
- (5) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) or it appears to be a standalone column with text, treat it as a heading.
342
- - "subtopics" remains empty.
343
- Example:
344
- If there is only one column image that is "Specialisation, devision of labour and exchange" and the right column is not present, your output is:
345
- {
346
- "title": "Specialisation, devision of labour and exchange",
347
- "subtopics": []
348
- }
349
-
350
- (6) **If there is a character + digit pattern** in the left column of a two-column table (for example "G2", "G3", "G4", "C1"), treat that as a topic-like label:
351
- - Put that label text into "title" (e.g. "G2").
352
- - "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.
353
-
354
- (7) **Output must be valid JSON** in this exact structure, with no extra text or explanation:
355
- {
356
- "title": "...",
357
- "subtopics": [...]
358
- }
359
-
360
- (8) **If the image is blank or truncated**, defined as:
361
- - Contains no words at all (e.g. a blank white or black image), **OR**
362
- - Contains only snippet words/phrases such as "Topics", "Subject content", "Content Amplification Additional guidance notes", "What students need to learn" (including variations in background color), **OR**
363
- - Contains partial headings with no recognizable numeric or textual headings
364
- - Contains partial UI labels only, such as “Topics” in a gray bar or “What students need to learn” in a blue bar, with no additional meaningful text.
365
- then return:
366
- {
367
- "title": "EMPTY_IMAGE",
368
- "subtopics": []
369
- }
370
-
371
- (9) **If you cannot recognize any text matching the patterns above**, or the text is too partial/truncated to form a valid heading, also return:
372
- {
373
- "title": "EMPTY_IMAGE",
374
- "subtopics": []
375
- }
376
-
377
- **Examples**:
378
-
379
- - If the image text is "2 Algebra and functions continued", return:
380
- {
381
- "title": "2 Algebra and functions",
382
- "subtopics": []
383
- }
384
-
385
- - If the image text is "2.5 Solve linear and quadratic inequalities ...", return:
386
- {
387
- "title": "",
388
- "subtopics": ["2.5"]
389
- }
390
-
391
- - If the image text is "Specialisation, division of labour and exchange" (with no numeric patterns at all), return:
392
- {
393
- "title": "Specialisation, division of labour and exchange",
394
- "subtopics": []
395
- }
396
-
397
- - If the left column says "G2" and the right column has details, but no subtopic numbers, return:
398
- {
399
- "title": "G2",
400
- "subtopics": []
401
- }
402
-
403
- - If the image is blank or shows only partial/truncated snippet words (e.g. "Topics", "Content Amplification Additional guidance notes", "Subject content", "What students need to learn") and nothing else, return:
404
- {
405
- "title": "EMPTY_IMAGE",
406
- "subtopics": []
407
- }
408
- """
409
- global _GEMINI_CLIENT
410
- if _GEMINI_CLIENT is None:
411
- _GEMINI_CLIENT = genai.Client(api_key=api_key)
412
- client = _GEMINI_CLIENT
413
-
414
- resp = client.models.generate_content(
415
- model="gemini-2.0-flash",
416
- contents=[
417
- {
418
- "parts": [
419
- {"text": prompt},
420
- {
421
- "inline_data": {
422
- "mime_type": "image/jpeg",
423
- "data": base64.b64encode(image_data).decode("utf-8")
424
- }
425
- }
426
- ]
427
- }
428
- ],
429
- config=types.GenerateContentConfig(temperature=0.0)
430
- )
431
-
432
- if not resp or not resp.text:
433
- logger.warning("Gemini returned an empty response for subtopic extraction.")
434
- return {"title": "", "subtopics": []}
435
-
436
- raw = resp.text.strip()
437
- # Remove any markdown fences if present
438
- raw = raw.replace("```json", "").replace("```", "").strip()
439
- data = json.loads(raw)
440
-
441
- title = data.get("title", "")
442
- subtopics = data.get("subtopics", [])
443
- if title.upper() == "EMPTY_IMAGE":
444
- return {"title": "EMPTY_IMAGE", "subtopics": []}
445
- if not isinstance(subtopics, list):
446
- subtopics = []
447
- return {"title": title, "subtopics": subtopics}
448
-
449
- except Exception as e:
450
- logger.error(f"Gemini subtopic identification error on attempt {attempt}: {e}")
451
- if attempt < max_retries:
452
- time.sleep(0.5)
453
- else:
454
- return {"title": "", "subtopics": []}
455
-
456
- return {"title": "", "subtopics": []}
457
-
458
- class S3ImageWriter(DataWriter):
459
- def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
460
- self.s3_writer = s3_writer
461
- self.base_path = base_path if base_path.endswith("/") else base_path + "/"
462
- self.gemini_api_key = gemini_api_key
463
- self.descriptions = {}
464
- self._img_count = 0
465
- self.extracted_tables = {}
466
-
467
- self.extracted_subtopics = {}
468
-
469
- def write(self, path: str, data: bytes) -> None:
470
- self._img_count += 1
471
- unique_id = f"img_{self._img_count}.jpg"
472
- s3_key = f"{self.base_path}{unique_id}"
473
- self.s3_writer.write(s3_key, data)
474
- self.descriptions[path] = {
475
- "data": data,
476
- "s3_path": s3_key,
477
- "table_classification": "NO_TABLE",
478
- "final_alt": ""
479
- }
480
-
481
- async def post_process_async(self, key: str, md_content: str) -> str:
482
- logger.info("Classifying images to detect tables.")
483
- tasks = {
484
- p: asyncio.create_task(classify_image_async(info["data"], self.gemini_api_key))
485
- for p, info in self.descriptions.items()
486
- }
487
- results = await asyncio.gather(*tasks.values(), return_exceptions=True)
488
- for p, result in zip(list(self.descriptions.keys()), results):
489
- if isinstance(result, Exception):
490
- logger.error(f"Table classification error for {p}: {result}")
491
- self.descriptions[p]['table_classification'] = "NO_TABLE"
492
- else:
493
- self.descriptions[p]['table_classification'] = result
494
-
495
- # Process each image description.
496
- for p, info in list(self.descriptions.items()):
497
- cls = info['table_classification']
498
- if cls == "TWO_COLUMN":
499
- info['final_alt'] = "HAS TO BE PROCESSED - two column table"
500
- elif cls == "THREE_COLUMN":
501
- info['final_alt'] = "HAS TO BE PROCESSED - three column table"
502
- elif cls == "EMPTY_IMAGE":
503
- md_content = md_content.replace(f"![]({key}{p})", "")
504
- try:
505
- self.s3_writer.delete(info['s3_path'])
506
- except Exception as e:
507
- logger.error(f"Error deleting S3 object {info['s3_path']}: {e}")
508
- del self.descriptions[p]
509
- continue
510
- else:
511
- info['final_alt'] = "NO_TABLE image"
512
- md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['s3_path']})")
513
-
514
- md_content = await self._process_table_images_in_markdown(key, md_content)
515
-
516
- # Filter final lines to keep only lines with images.
517
- final_lines = [
518
- line.strip() for line in md_content.split("\n")
519
- if re.match(r"^\!\[.*\]\(.*\)", line.strip())
520
- ]
521
- return "\n".join(final_lines)
522
-
523
- async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
524
- pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
525
- matches = re.findall(pat, md_content, flags=re.IGNORECASE)
526
- if not matches:
527
- return md_content
528
-
529
- for (col_type, s3_key) in matches:
530
- logger.info(f"Processing table image: {s3_key}, columns={col_type}")
531
- img_data = None
532
- for desc in self.descriptions.values():
533
- if desc.get("s3_path") == s3_key:
534
- img_data = desc.get("data")
535
- break
536
- if img_data is None:
537
- logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
538
- continue
539
-
540
- # Write temporary file for processing.
541
- with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
542
- temp_file.write(img_data)
543
- temp_path = temp_file.name
544
-
545
- try:
546
- if col_type.lower() == 'two':
547
- extractor = TableExtractor(
548
- skip_header=True,
549
- merge_two_col_rows=True,
550
- enable_subtopic_merge=True,
551
- subtopic_threshold=0.2
552
- )
553
- else:
554
- extractor = TableExtractor(
555
- skip_header=True,
556
- merge_two_col_rows=False,
557
- enable_subtopic_merge=False,
558
- subtopic_threshold=0.2
559
- )
560
- row_boxes = extractor.process_image(temp_path)
561
-
562
- # logger.info(f"Extracted {len(row_boxes)} rows from {temp_path}")
563
- # for i, row in enumerate(row_boxes):
564
- # logger.info(f"Row {i} has {len(row)} cells")
565
-
566
- out_folder = temp_path + "_rows"
567
- os.makedirs(out_folder, exist_ok=True)
568
- # out_folder = os.path.join(os.path.dirname(temp_path), os.path.basename(temp_path) + "_rows")
569
- # os.makedirs(out_folder, exist_ok=True)
570
-
571
- extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
572
- #just to print structure how cells are saved and named for each table image
573
- # logger.info(f"Files in {out_folder}:")
574
- # for root, dirs, files in os.walk(out_folder):
575
- # logger.info(f"{root}: {files}")
576
-
577
- recognized_main_topic = ""
578
- main_topic_image_key = None
579
- recognized_subtopics = []
580
-
581
- # Loop over each cell image.
582
- for i, row in enumerate(row_boxes):
583
- row_dir = os.path.join(out_folder, f"row_{i}")
584
- for j, _ in enumerate(row):
585
- cell_path = os.path.join(row_dir, f"col_{j}.png")
586
- if not os.path.isfile(cell_path):
587
- alternative_path = os.path.join(row_dir, f"col_{j}.jpg")
588
- if os.path.isfile(alternative_path):
589
- cell_path = alternative_path
590
- else:
591
- logger.warning(f"Cell image not found: {cell_path}")
592
- continue
593
-
594
- with open(cell_path, "rb") as cf:
595
- cell_image_data = cf.read()
596
-
597
- cell_key = f"{self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.png"
598
- self.s3_writer.write(cell_key, cell_image_data)
599
-
600
- #extract subtopic info from the cell image.
601
- info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)
602
-
603
- # Check if the image is empty.
604
- if info.get("title", "").upper() == "EMPTY_IMAGE":
605
- try:
606
- self.s3_writer.delete(cell_key)
607
- logger.info(f"Deleted empty cell image from S3: {cell_key}")
608
- except Exception as e:
609
- logger.error(f"Error deleting empty cell image {cell_key}: {e}")
610
- continue # Skip processing this cell further
611
-
612
- if info["title"] and not recognized_main_topic:
613
- recognized_main_topic = info["title"]
614
- main_topic_image_key = cell_key
615
-
616
- for st in info["subtopics"]:
617
- recognized_subtopics.append({
618
- "title": st,
619
- "contents": [{"type": "image", "key": cell_key}],
620
- "children": []
621
- })
622
-
623
- final_json = {
624
- "title": recognized_main_topic,
625
- "contents": [],
626
- "children": recognized_subtopics
627
- }
628
- if main_topic_image_key:
629
- final_json["contents"].append({"type": "image", "key": main_topic_image_key})
630
-
631
- # Save the final JSON.
632
- self.extracted_subtopics[s3_key] = final_json
633
-
634
- # Optionally, create a snippet to replace the markdown line.
635
- snippet = ["**Extracted table cells:**"]
636
- for i, row in enumerate(row_boxes):
637
- for j, _ in enumerate(row):
638
- snippet.append(f"![Row {i} Col {j}]({self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.jpg)")
639
- new_snip = "\n".join(snippet)
640
- old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
641
- md_content = md_content.replace(old_line, new_snip)
642
-
643
- except Exception as e:
644
- logger.error(f"Error processing table image {s3_key}: {e}")
645
- finally:
646
- os.remove(temp_path)
647
-
648
- return md_content
649
-
650
- def post_process(self, key: str, md_content: str) -> str:
651
- return asyncio.run(self.post_process_async(key, md_content))
652
-
653
- class GeminiTopicExtractor:
654
- def __init__(self, api_key: str = None, num_pages: int = 14):
655
- self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
656
- self.num_pages = num_pages
657
-
658
- def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
659
- first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
660
- if not first_pages_text.strip():
661
- logger.error("No text from first pages => cannot extract subtopics.")
662
- return {}
663
- prompt = f"""
664
- You have the first pages of a PDF specification, including a table of contents.
665
- Instructions:
666
- 1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
667
- 2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
668
- 3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
669
- 4. Output only valid JSON of the form:
670
- {{
671
- "Subtopic A": [start_page, end_page],
672
- "Subtopic B": [start_page, end_page]
673
- }}
674
- 5. If you can't find any subtopics, return an empty JSON.
675
- Important notes:
676
- - The correct "end_page" must be the page number of the next topic or subtopic minus 1.
677
- - The final output must be valid JSON only, with no extra text or code blocks.
678
- Examples:
679
- 1. Given this table of contents:
680
- 1 Introduction – 2
681
- Why choose Edexcel A Level Mathematics? - 2
682
- Supporting you in planning and implementing this qualification - 3
683
- Qualification at a glance - 5
684
- 2 Subject content and assessment information – 7
685
- Paper 1 and Paper 2: Pure Mathematics - 11
686
- Paper 3: Statistics and Mechanics - 30
687
- Assessment Objectives - 40
688
- 3 Administration and general information – 42
689
- Entries - 42
690
- Access arrangements, reasonable adjustments, special consideration and malpractice - 42
691
- Student recruitment and progression - 45
692
- Appendix 1: Formulae – 49
693
- Appendix 2: Notation – 53
694
- Appendix 3: Use of calculators – 59
695
- Appendix 4: Assessment Objectives – 60
696
- Appendix 5: The context for the development of this qualification – 62
697
- Appendix 6: Transferable skills – 64
698
- Appendix 7: Level 3 Extended Project qualification – 65
699
- Appendix 8: Codes – 67
700
- The correct output should be:
701
- {{
702
- "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
703
- "Paper 3: Statistics and Mechanics": [30, 42]
704
- }}
705
- 2. Given this table of contents:
706
- Qualification at a glance – 1
707
- Assessment Objectives and weightings - 4
708
- Knowledge, skills and understanding – 5
709
- Theme 1: Introduction to markets and market failure - 5
710
- Theme 2: The UK economy – performance and policies - 11
711
- Theme 3: Business behaviour and the labour market - 21
712
- Theme 4: A global perspective - 29
713
- Assessment – 39
714
- Assessment summary - 39
715
- Assessment objectives - 41
716
- Assessment overview - 42
717
- Breakdown of assessment objectives - 42
718
- Synoptic assessment - 43
719
- Discount code and performance tables - 43
720
- Access arrangements, reasonable adjustments and special consideration - 44
721
- Malpractice - 45
722
- Equality Act 2010 and Pearson equality policy - 45
723
- Synoptic assessment - 46
724
- Awarding and reporting - 47
725
- Other information – 49
726
- Student recruitment -49
727
- Prior learning and other requirements -49
728
- Progression - 49
729
- Appendix 1: Transferable skills – 53
730
- Appendix 2: Level 3 Extended Project qualification – 55
731
- Appendix 3: Quantitative skills – 59
732
- Appendix 4: Codes – 61
733
- Appendix 5: Index – 63
734
- The correct output should be:
735
- {{
736
- "Theme 1: Introduction to markets and market failure": [5, 10],
737
- "Theme 2: The UK economy – performance and policies": [11, 20],
738
- "Theme 3: Business behaviour and the labour market": [21, 28],
739
- "Theme 4: A global perspective": [29, 38]
740
- }}
741
- 3. You might also see sections like:
742
- 2.1 AS Unit 1 11
743
- 2.2 AS Unit 2 18
744
- 2.3 A2 Unit 3 24
745
- 2.4 A2 Unit 4 31
746
- In that scenario, your output might look like:
747
- {{
748
- "2.1 AS Unit 1": [11, 17],
749
- "2.2 AS Unit 2": [18, 23],
750
- "2.3 A2 Unit 3": [24, 30],
751
- "2.4 A2 Unit 4": [31, 35]
752
- }}
753
- or
754
- 2.1 AS units 6
755
- 2.2 AS units 23
756
- In that scenario, your output might look like:
757
- {{
758
- "2.1 AS Unit 1": [6, 2],
759
- "2.2 AS Unit 2": [23, 43]
760
- }}
761
-
762
- 4. Another example might list subtopics:
763
- 3.1 Overarching themes 11
764
- 3.2 A: Proof 12
765
- 3.3 B: Algebra and functions 13
766
- 3.4 C: Coordinate geometry in the ( x , y ) plane 14
767
- 3.5 D: Sequences and series 15
768
- 3.6 E: Trigonometry 16
769
- 3.7 F: Exponentials and logarithms 17
770
- 3.8 G: Differentiation 18
771
- 3.9 H: Integration 19
772
- 3.10 I: Numerical methods 20
773
- 3.11 J: Vectors 20
774
- 3.12 K: Statistical sampling 21
775
- 3.13 L: Data presentation and interpretation 21
776
- 3.14 M: Probability 22
777
- 3.15 N: Statistical distributions 23
778
- 3.16 O: Statistical hypothesis testing 23
779
- 3.17 P: Quantities and units in mechanics 24
780
- 3.18 Q: Kinematics 24
781
- 3.19 R: Forces and Newton’s laws 24
782
- 3.20 S: Moments 25
783
- 3.21 Use of data in statistics 26
784
- Here the correct output might look like:
785
- {{
786
- "A: Proof": [12, 12],
787
- "B: Algebra and functions": [13, 13],
788
- ...
789
- }}
790
- Now, extract topics from this text:
791
- {first_pages_text}
792
- """
793
- global _GEMINI_CLIENT
794
- if _GEMINI_CLIENT is None:
795
- _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
796
- client = _GEMINI_CLIENT
797
- try:
798
- response = client.models.generate_content(
799
- model="gemini-2.0-flash",
800
- contents=[prompt],
801
- config=types.GenerateContentConfig(temperature=0.0)
802
- )
803
- if not response or not response.text:
804
- logger.warning("No text from LLM => returning empty subtopics.")
805
- return {}
806
- raw_json = response.text.strip()
807
- cleaned = raw_json.replace("```json", "").replace("```", "")
808
- try:
809
- data = json.loads(cleaned)
810
- except Exception as json_err:
811
- logger.error(f"JSON parsing error: {json_err}")
812
- return {}
813
- final_dict = {}
814
- found_sub_dict = None
815
- for k, v in data.items():
816
- if isinstance(v, dict):
817
- found_sub_dict = v
818
- break
819
- if found_sub_dict is not None:
820
- for subk, rng in found_sub_dict.items():
821
- if isinstance(rng, list) and len(rng) == 2:
822
- final_dict[subk] = rng
823
- else:
824
- for subk, rng in data.items():
825
- if isinstance(rng, list) and len(rng) == 2:
826
- final_dict[subk] = rng
827
- return final_dict
828
- except Exception as e:
829
- logger.error(f"Gemini subtopic extraction error: {e}")
830
- return {}
831
-
832
- def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
833
- text_parts = []
834
- try:
835
- if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
836
- response = requests.get(pdf_path)
837
- if response.status_code != 200:
838
- logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
839
- return ""
840
- pdf_bytes = response.content
841
- else:
842
- with open(pdf_path, "rb") as f:
843
- pdf_bytes = f.read()
844
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
845
- pages_to_read = min(num_pages, doc.page_count)
846
- for i in range(pages_to_read):
847
- raw_text = doc[i].get_text("raw")
848
- text_parts.append(raw_text)
849
- doc.close()
850
- except Exception as e:
851
- logger.error(f"Could not open PDF: {e}")
852
- return "\n".join(text_parts)
853
-
854
- class MineruNoTextProcessor:
855
- def __init__(self, output_folder: str, gemini_api_key: str):
856
- self.output_folder = output_folder
857
- os.makedirs(self.output_folder, exist_ok=True)
858
- self.layout_model = "doclayout_yolo"
859
- self.formula_enable = True
860
- self.table_enable = False
861
- self.language = "en"
862
-
863
- self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
864
- self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
865
-
866
- self.use_s3 = True
867
- self.s3_writer = s3Writer(
868
- ak=os.getenv("S3_ACCESS_KEY"),
869
- sk=os.getenv("S3_SECRET_KEY"),
870
- bucket="quextro-resources",
871
- endpoint_url=os.getenv("S3_ENDPOINT")
872
- )
873
-
874
- def cleanup_gpu(self):
875
- try:
876
- gc.collect()
877
- torch.cuda.empty_cache()
878
- logger.info("GPU memory cleaned up.")
879
- except Exception as e:
880
- logger.error(f"Error during GPU cleanup: {e}")
881
-
882
- def process(self, pdf_path: str) -> Dict[str, Any]:
883
- logger.info(f"Processing PDF: {pdf_path}")
884
- try:
885
- subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
886
- logger.info(f"Gemini returned subtopics: {subtopics}")
887
-
888
- if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
889
- response = requests.get(pdf_path)
890
- if response.status_code != 200:
891
- logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
892
- raise Exception(f"Failed to download PDF: {pdf_path}")
893
- pdf_bytes = response.content
894
- logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
895
- else:
896
- with open(pdf_path, "rb") as f:
897
- pdf_bytes = f.read()
898
- logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
899
-
900
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
901
- total_pages = doc.page_count
902
- doc.close()
903
-
904
- # Decide which pages to process
905
- final_pages = set()
906
- if not subtopics:
907
- # fallback
908
- final_pages = set(range(total_pages))
909
- else:
910
- offset_candidates = []
911
- for subname, rng in subtopics.items():
912
- start_p, _ = rng
913
- occs = find_all_occurrences(pdf_bytes, subname)
914
- for p in occs:
915
- candidate = p - (start_p - 1)
916
- if candidate > 0:
917
- offset_candidates.append(candidate)
918
- if offset_candidates:
919
- try:
920
- from statistics import mode
921
- global_offset = mode(offset_candidates)
922
- except:
923
- from statistics import median
924
- global_offset = int(median(offset_candidates))
925
- else:
926
- global_offset = 0
927
-
928
- logger.info(f"Computed global offset: {global_offset}")
929
- for subname, rng in subtopics.items():
930
- if not (isinstance(rng, list) and len(rng) == 2):
931
- continue
932
- start_p, end_p = rng
933
- if start_p > end_p:
934
- continue
935
- s0 = (start_p - 1) + global_offset
936
- e0 = (end_p - 1) + global_offset
937
- for pp in range(s0, e0 + 1):
938
- final_pages.add(pp)
939
-
940
- if not final_pages:
941
- final_pages = set(range(total_pages))
942
-
943
- logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
944
- subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
945
-
946
- # 4) Analyze and produce markdown
947
- dataset = PymuDocDataset(subset_pdf_bytes)
948
- inference = doc_analyze(
949
- dataset,
950
- ocr=True,
951
- lang=self.language,
952
- layout_model=self.layout_model,
953
- formula_enable=self.formula_enable,
954
- table_enable=self.table_enable
955
- )
956
- #S3
957
- writer = S3ImageWriter(self.s3_writer, "/topic-extraction", self.gemini_api_key)
958
-
959
- md_prefix = "/topic-extraction/"
960
- pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
961
- md_content = pipe_result.get_markdown(md_prefix)
962
- final_markdown = writer.post_process(md_prefix, md_content)
963
-
964
- subtopic_list = list(writer.extracted_subtopics.values())
965
- subtopic_list = merge_topics(subtopic_list)
966
-
967
- out_path = os.path.join(self.output_folder, "_subtopics.json")
968
- with open(out_path, "w", encoding="utf-8") as f:
969
- json.dump(subtopic_list, f, indent=2)
970
- logger.info(f"Final subtopics JSON saved locally at {out_path}")
971
-
972
- return {
973
- "final_markdown": final_markdown,
974
- "subtopics_extracted": subtopic_list
975
- }
976
- finally:
977
- self.cleanup_gpu()
978
-
979
- if __name__ == "__main__":
980
- input_pdf = "/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf"
981
- output_dir = "/home/user/app/pearson_json"
982
- api_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
983
- try:
984
- processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=api_key)
985
- result = processor.process(input_pdf)
986
- logger.info("Processing completed successfully.")
987
- except Exception as e:
988
- logger.error(f"Processing failed: {e}")
 
topic_extraction.log → topic_processor.log RENAMED
File without changes
worker.py CHANGED
@@ -10,7 +10,7 @@ from typing import Tuple, Dict, Any
10
 
11
  from mineru_single import Processor
12
 
13
- from topic_extraction import MineruNoTextProcessor
14
 
15
  import logging
16
 
@@ -27,10 +27,7 @@ class RabbitMQWorker:
27
  logger.info("Initializing RabbitMQWorker")
28
  self.processor = Processor()
29
 
30
- self.topic_processor = MineruNoTextProcessor(
31
- output_folder="/tmp/topic_extraction_outputs",
32
- gemini_api_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
33
- )
34
 
35
  self.publisher_connection = None
36
  self.publisher_channel = None
@@ -132,35 +129,32 @@ class RabbitMQWorker:
132
  elif pattern == "topic_extraction":
133
  data = body_dict.get("data")
134
  input_files = data.get("input_files")
135
- logger.info("[Worker %s] Found %d file(s) to process for topic extraction.", thread_id, len(input_files))
136
 
137
- topics_contexts = []
138
  for file in input_files:
139
  try:
140
- pdf_url = file.get("url")
141
- logger.info("[Worker %s] Processing topic extraction for URL: %s", thread_id, pdf_url)
142
-
143
- result = self.topic_processor.process(pdf_url)
144
- # result = self.topic_processor.process(pdf_url, inputs={"api_key": os.getenv("GEMINI_API_KEY")})
145
  context = {
146
- "key": file.get("key", ""),
147
- "body": result
148
  }
149
- topics_contexts.append(context)
150
  except Exception as e:
151
- err_str = f"Error processing topic extraction for file {file.get('key', '')}: {e}"
152
  logger.error(err_str)
153
- topics_contexts.append({"key": file.get("key", ""), "body": err_str})
154
- data["topics_markdown"] = topics_contexts
 
155
  body_dict["pattern"] = "topic_extraction_update_from_gpu_server"
156
  body_dict["data"] = data
 
157
  if self.publish_message(body_dict, headers):
158
- logger.info("[Worker %s] Successfully published topic extraction results to ml_server.", thread_id)
159
  ch.basic_ack(delivery_tag=method.delivery_tag)
160
  else:
161
  ch.basic_nack(delivery_tag=method.delivery_tag, requeue=True)
162
-
163
- logger.info("[Worker %s] Contexts: %s", thread_id, contexts)
 
164
 
165
  else:
166
  ch.basic_ack(delivery_tag=method.delivery_tag, requeue=False)
@@ -219,6 +213,4 @@ def main():
219
  worker.start()
220
 
221
  if __name__ == "__main__":
222
- main()
223
-
224
- __all__ = ['main']
 
10
 
11
  from mineru_single import Processor
12
 
13
+ from topic_extr import TopicExtractionProcessor
14
 
15
  import logging
16
 
 
27
  logger.info("Initializing RabbitMQWorker")
28
  self.processor = Processor()
29
 
30
+ self.topic_processor = TopicExtractionProcessor()
 
 
 
31
 
32
  self.publisher_connection = None
33
  self.publisher_channel = None
 
129
  elif pattern == "topic_extraction":
130
  data = body_dict.get("data")
131
  input_files = data.get("input_files")
132
+ logger.info("[Worker %s] Found %d file(s) for topic extraction.", thread_id, len(input_files))
133
 
 
134
  for file in input_files:
135
  try:
 
 
 
 
 
136
  context = {
137
+ "key": file["key"],
138
+ "body": self.topic_processor.process(file)
139
  }
140
+ contexts.append(context)
141
  except Exception as e:
142
+ err_str = f"Error processing file {file.get('key', '')}: {e}"
143
  logger.error(err_str)
144
+ contexts.append({"key": file.get("key", ""), "body": err_str})
145
+
146
+ data["md_context"] = contexts
147
  body_dict["pattern"] = "topic_extraction_update_from_gpu_server"
148
  body_dict["data"] = data
149
+
150
  if self.publish_message(body_dict, headers):
151
+ logger.info("[Worker %s] Published topic extraction results to ml_server.", thread_id)
152
  ch.basic_ack(delivery_tag=method.delivery_tag)
153
  else:
154
  ch.basic_nack(delivery_tag=method.delivery_tag, requeue=True)
155
+ logger.error("[Worker %s] Failed to publish topic results.", thread_id)
156
+
157
+ logger.info("[Worker %s] Topic contexts: %s", thread_id, contexts)
158
 
159
  else:
160
  ch.basic_ack(delivery_tag=method.delivery_tag, requeue=False)
 
213
  worker.start()
214
 
215
  if __name__ == "__main__":
216
+ main()
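The reworked topic_extraction branch in worker.py, shown above, boils down to building one context entry per input file and republishing under a new pattern. A minimal sketch of that payload shape, with a stubbed processor standing in for topic_extr.TopicExtractionProcessor and hypothetical file entries, is:

```python
# Sketch of the payload the reworked worker republishes (stub processor, hypothetical files).
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class StubTopicProcessor:
    def process(self, file: dict) -> str:
        # Stand-in for topic_extr.TopicExtractionProcessor.process(file).
        return f"markdown for {file['url']}"

def build_update(body_dict: dict, processor) -> dict:
    """Mirror of the per-file loop: collect results, attach as data['md_context'],
    and switch the pattern for the reply message."""
    data = body_dict["data"]
    contexts = []
    for file in data.get("input_files", []):
        try:
            contexts.append({"key": file["key"], "body": processor.process(file)})
        except Exception as e:  # keep going; report the failure in the payload instead
            logger.error("Error processing file %s: %s", file.get("key", ""), e)
            contexts.append({"key": file.get("key", ""), "body": str(e)})
    data["md_context"] = contexts
    body_dict["pattern"] = "topic_extraction_update_from_gpu_server"
    body_dict["data"] = data
    return body_dict

msg = {"pattern": "topic_extraction",
       "data": {"input_files": [{"key": "spec-1", "url": "https://example.org/spec.pdf"}]}}
print(build_update(msg, StubTopicProcessor()))
```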