Overglitch committed on
Commit
78bf8ed
·
verified ·
1 Parent(s): 80e2948

Update modules/preprocessing.py

Browse files
Files changed (1) hide show
  1. modules/preprocessing.py +21 -19
modules/preprocessing.py CHANGED
@@ -97,25 +97,27 @@ class PDFProcessor:
97
  self.ocr_model = ocr_predictor(pretrained=True)
98
  self.max_pages = max_pages
99
 
100
def pdf_to_text(self, file_path: str) -> str:
    """Convert a PDF file to preprocessed text using OCR.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of at most ``self.max_pages`` pages, one block per line,
        after being passed through ``Preprocessor().preprocess_text``.

    Raises:
        ValueError: If the loaded document is neither a list of pages nor
            an object exposing a ``pages`` attribute.
    """

    def block_text(block):
        # docTR layout blocks expose their text through ``render()``; a
        # plain ``text`` attribute is still honoured for other page objects.
        return block.text if hasattr(block, "text") else block.render()

    doc = DocumentFile.from_pdf(Path(file_path))

    if isinstance(doc, list):
        # A list result holds rendered page images with no layout yet
        # (presumably numpy arrays -- confirm against the installed docTR),
        # so the OCR model has to run before ``.blocks`` exists.
        # Slicing also copes with ``max_pages`` being None (no limit).
        images = doc[: self.max_pages]
        # Skip the model entirely when there is nothing to recognise.
        pages = self.ocr_model(images).pages if images else []
    elif hasattr(doc, "pages"):
        # Already a paged document: use its pages as they are.
        pages = doc.pages[: self.max_pages]
    else:
        raise ValueError("Formato inesperado para el documento PDF.")

    raw_text = "\n".join(
        block_text(block) for page in pages for block in page.blocks
    )
    return Preprocessor().preprocess_text(raw_text)
 
 
119
 
120
 
121
 
 
97
  self.ocr_model = ocr_predictor(pretrained=True)
98
  self.max_pages = max_pages
99
 
100
def pdf_to_text(pdf_path):
    """Run docTR OCR over a PDF and return the recognised text.

    Args:
        pdf_path: Location of the PDF, passed unchanged to
            ``DocumentFile.from_pdf``.

    Returns:
        The text of every block on every page, in page order, joined with
        newlines.
    """
    # NOTE(review): the lines just above this function assign
    # ``self.ocr_model`` and ``self.max_pages``, yet this signature has no
    # ``self``. If it is defined inside the class, calling it on an instance
    # raises TypeError -- confirm the intended placement and call style.
    # NOTE(review): a fresh pretrained predictor is built on every call and
    # no page limit is applied; confirm that this is intentional.

    # 1) Load the PDF.
    doc = DocumentFile.from_pdf(pdf_path)

    # 2) Build the OCR predictor from docTR's pretrained models.
    predictor = ocr_predictor(pretrained=True)

    # 3) Run it on the document to obtain the page layout.
    ocr_result = predictor(doc)

    # 4) Collect the text of each block. docTR layout elements expose their
    #    text through ``render()`` rather than a ``text`` attribute.
    return "\n".join(
        block.render() for page in ocr_result.pages for block in page.blocks
    )
121
 
122
 
123