SkyNait committed on
Commit
1116a38
·
verified ·
1 Parent(s): 7d41757

Add logging

Browse files
Files changed (1) hide show
  1. mineru_single.py +122 -97
mineru_single.py CHANGED
@@ -1,98 +1,123 @@
1
- #!/usr/bin/env python3
2
- import os
3
- import uuid
4
- import json
5
- import requests
6
- from loguru import logger
7
-
8
- from magic_pdf.data.dataset import PymuDocDataset
9
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
10
- from magic_pdf.data.io.s3 import S3Writer
11
- from magic_pdf.data.data_reader_writer.base import DataWriter
12
-
13
- from inference_svm_model import SVMModel
14
-
15
class Processor:
    """
    Downloads a PDF, runs MinerU OCR analysis on it, uploads relevant extracted
    images to S3, and returns Markdown with irrelevant image references removed.
    """
    def __init__(self):
        # S3 destination for extracted images; credentials come from the environment.
        self.s3_writer = S3Writer(
            ak=os.getenv("S3_ACCESS_KEY"),
            sk=os.getenv("S3_SECRET_KEY"),
            bucket=os.getenv("S3_BUCKET_NAME"),
            endpoint_url=os.getenv("S3_ENDPOINT"),
        )

        # Classifier used to decide whether each extracted image is relevant.
        self.svm_model = SVMModel()

        # MinerU pipeline configuration.
        with open("/home/user/magic-pdf.json", "r") as f:
            config = json.load(f)

        self.layout_mode = config["layout-config"]["model"]
        self.formula_enable = config["formula-config"]["enable"]
        self.table_enable = config["table-config"]["enable"]
        self.language = "en"

        # Public URL prefix under which uploaded images will be addressable.
        endpoint = os.getenv("S3_ENDPOINT", "").rstrip("/")
        bucket = os.getenv("S3_BUCKET_NAME", "")
        self.prefix = f"{endpoint}/{bucket}/document-extracts/"

    def process(self, file_url: str, key: str) -> str:
        """
        Process a single PDF, returning final Markdown with irrelevant images removed.

        :param file_url: URL of the PDF to download.
        :param key: per-document segment appended to the image URL prefix.
        :raises Exception: if the PDF cannot be downloaded.
        """
        logger.info("Processing file: {}", file_url)
        # BUG FIX: requests.get() without a timeout can hang indefinitely on a
        # stalled server; bound the download instead.
        response = requests.get(file_url, timeout=60)
        if response.status_code != 200:
            raise Exception(f"Failed to download PDF: {file_url}")
        pdf_bytes = response.content

        dataset = PymuDocDataset(pdf_bytes)
        inference = doc_analyze(
            dataset,
            ocr=True,
            lang=self.language,
            layout_model=self.layout_mode,
            formula_enable=self.formula_enable,
            table_enable=self.table_enable
        )

        # ImageWriter (defined below) uploads relevant images and records
        # the paths of irrelevant ones for later removal from the Markdown.
        image_writer = ImageWriter(self.s3_writer, self.svm_model)

        pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)

        md_content = pipe_result.get_markdown(self.prefix + key + "/")

        # Remove references to images classified as "irrelevant"
        final_markdown = image_writer.remove_redundant_images(md_content)
        return final_markdown
64
-
65
class ImageWriter(DataWriter):
    """
    Callback sink for images extracted by the MinerU pipeline.

    Each image is classified; relevant ones are uploaded to S3, irrelevant
    ones are recorded so their references can be stripped from the Markdown.
    """
    def __init__(self, s3_writer: S3Writer, svm_model: SVMModel):
        self.s3_writer = s3_writer
        self.svm_model = svm_model
        # Paths of images judged irrelevant, in arrival order.
        self._redundant_images_paths = []

    def write(self, path: str, data: bytes) -> None:
        # A label of 1 means "relevant" — upload; anything else is queued for removal.
        if self.svm_model.classify_image(data) == 1:
            self.s3_writer.write(path, data)
            return
        self._redundant_images_paths.append(path)

    def remove_redundant_images(self, md_content: str) -> str:
        # Strip the exact Markdown image reference for every flagged path.
        for redundant_path in self._redundant_images_paths:
            md_content = md_content.replace(f"![]({redundant_path})", "")
        return md_content
88
-
89
if __name__ == "__main__":
    processor = Processor()

    # BUG FIX: Processor.process() requires both the file URL and an S3 key
    # segment; calling it with only the URL raised TypeError.
    single_url = "https://example.com/somefile.pdf"
    markdown_result = processor.process(single_url, "somefile")
    print("Single file Markdown:\n", markdown_result)

    # BUG FIX: Processor defines no process_batch(); process each URL in turn.
    multiple_urls = ["https://example.com/file1.pdf", "https://example.com/file2.pdf"]
    batch_results = [
        processor.process(url, f"file{i}")
        for i, url in enumerate(multiple_urls, start=1)
    ]
    print("Batch results:", batch_results)
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import uuid
4
+ import json
5
+ import requests
6
+ import logging
7
+
8
+ from magic_pdf.data.dataset import PymuDocDataset
9
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
10
+ from magic_pdf.data.io.s3 import S3Writer
11
+ from magic_pdf.data.data_reader_writer.base import DataWriter
12
+
13
+ from inference_svm_model import SVMModel
14
+
15
# Configure root logging once at import time: INFO level with
# timestamped "[LEVEL] name - message" lines.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)
20
+
21
class Processor:
    """
    Downloads a PDF, runs MinerU OCR analysis on it, uploads relevant extracted
    images to S3, and returns Markdown with irrelevant image references removed.
    """

    # Bound on a single PDF download; prevents a stalled server from hanging us.
    DOWNLOAD_TIMEOUT_SECONDS = 60

    def __init__(self):
        try:
            # S3 destination for extracted images; credentials come from the environment.
            self.s3_writer = S3Writer(
                ak=os.getenv("S3_ACCESS_KEY"),
                sk=os.getenv("S3_SECRET_KEY"),
                bucket=os.getenv("S3_BUCKET_NAME"),
                endpoint_url=os.getenv("S3_ENDPOINT"),
            )

            # Classifier used to decide whether each extracted image is relevant.
            self.svm_model = SVMModel()
            logger.info("Classification model initialized successfully")

            # MinerU pipeline configuration.
            with open("/home/user/magic-pdf.json", "r") as f:
                config = json.load(f)

            self.layout_mode = config["layout-config"]["model"]
            self.formula_enable = config["formula-config"]["enable"]
            self.table_enable = config["table-config"]["enable"]
            self.language = "en"

            # Public URL prefix under which uploaded images will be addressable.
            endpoint = os.getenv("S3_ENDPOINT", "").rstrip("/")
            bucket = os.getenv("S3_BUCKET_NAME", "")
            self.prefix = f"{endpoint}/{bucket}/document-extracts/"

            logger.info("Processor initialized successfully")
        except Exception as e:
            logger.error("Failed to initialize Processor: %s", str(e))
            raise

    def process(self, file_url: str, key: str) -> str:
        """
        Process a single PDF, returning final Markdown with irrelevant images removed.

        :param file_url: URL of the PDF to download.
        :param key: per-document segment appended to the image URL prefix.
        :raises Exception: if the PDF cannot be downloaded.
        """
        logger.info("Processing file: %s", file_url)

        # BUG FIX: requests.get() without a timeout can hang indefinitely on a
        # stalled server; bound the download instead.
        response = requests.get(file_url, timeout=self.DOWNLOAD_TIMEOUT_SECONDS)
        if response.status_code != 200:
            logger.error("Failed to download PDF from %s. Status code: %d", file_url, response.status_code)
            raise Exception(f"Failed to download PDF: {file_url}")

        pdf_bytes = response.content
        logger.info("Downloaded %d bytes for file_url='%s'", len(pdf_bytes), file_url)

        # Analyze PDF with OCR
        dataset = PymuDocDataset(pdf_bytes)
        inference = doc_analyze(
            dataset,
            ocr=True,
            lang=self.language,
            layout_model=self.layout_mode,
            formula_enable=self.formula_enable,
            table_enable=self.table_enable
        )
        # FIX: original message read "Started to extracting images" (broken grammar).
        logger.info("doc_analyze complete for key='%s'. Starting to extract images...", key)

        # Classify images, remove irrelevant
        image_writer = ImageWriter(self.s3_writer, self.svm_model)
        pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)

        md_content = pipe_result.get_markdown(self.prefix + key + "/")
        final_markdown = image_writer.remove_redundant_images(md_content)
        logger.info("Completed PDF process for key='%s'. Final MD length=%d", key, len(final_markdown))
        return final_markdown
85
+
86
class ImageWriter(DataWriter):
    """
    Receives each extracted image. Classifies it, uploads if relevant, or flags
    it for removal if irrelevant.
    """
    def __init__(self, s3_writer: S3Writer, svm_model: SVMModel):
        self.s3_writer = s3_writer
        self.svm_model = svm_model
        # Paths of images classified as irrelevant; stripped from the Markdown later.
        self._redundant_images_paths = []

    def write(self, path: str, data: bytes) -> None:
        """
        Called for each extracted image. If relevant, upload to S3; otherwise mark for removal.
        """
        # NOTE(review): despite the name, the label is compared to int 1 —
        # presumably classify_image returns an int label; confirm against SVMModel.
        label_str = self.svm_model.classify_image(data)

        if label_str == 1:
            # BUG FIX: was `self.logger.info(...)`, but ImageWriter never sets a
            # `logger` attribute, so every write() raised AttributeError. Use the
            # module-level logger instead.
            logger.info("Image is relevant. Uploaded to S3. Path='%s'.", path)
            self.s3_writer.write(path, data)
        else:
            logger.info("Marked image at path='%s' as irrelevant. Will be removed from Markdown.", path)
            self._redundant_images_paths.append(path)

    def remove_redundant_images(self, md_content: str) -> str:
        """
        Strip the Markdown image reference for every path flagged as irrelevant.
        """
        for path in self._redundant_images_paths:
            md_content = md_content.replace(f"![]({path})", "")
        return md_content
113
+
114
if __name__ == "__main__":
    processor = Processor()

    # BUG FIX: Processor.process() requires both the file URL and an S3 key
    # segment; calling it with only the URL raised TypeError.
    single_url = "https://example.com/somefile.pdf"
    markdown_result = processor.process(single_url, "somefile")
    print("Single file Markdown:\n", markdown_result)

    # BUG FIX: Processor defines no process_batch(); process each URL in turn.
    multiple_urls = ["https://example.com/file1.pdf", "https://example.com/file2.pdf"]
    batch_results = [
        processor.process(url, f"file{i}")
        for i, url in enumerate(multiple_urls, start=1)
    ]
    print("Batch results:", batch_results)