Overglitch committed on
Commit
f3fc1b2
·
verified ·
1 Parent(s): e48356e

Update modules/preprocessing.py

Browse files
Files changed (1) hide show
  1. modules/preprocessing.py +11 -5
modules/preprocessing.py CHANGED
@@ -103,16 +103,22 @@ class PDFProcessor:
103
  """
104
  pdf_file = Path(file_path)
105
  doc = DocumentFile.from_pdf(pdf_file)
106
- if len(doc.pages) > self.max_pages:
107
- doc.pages = doc.pages[:self.max_pages]
108
-
109
- result = self.ocr_model(doc)
 
 
 
 
 
110
  raw_text = "\n".join(
111
- [block.text for page in result.pages for block in page.blocks]
112
  )
113
  return Preprocessor().preprocess_text(raw_text)
114
 
115
 
 
116
  class FileHandler:
117
  """Clase para manejar archivos temporales y limpieza."""
118
 
 
103
  """
104
  pdf_file = Path(file_path)
105
  doc = DocumentFile.from_pdf(pdf_file)
106
+
107
+ # Asegúrate de que `doc` sea un objeto compatible con pages
108
+ if isinstance(doc, list):
109
+ pages = doc[:self.max_pages] if len(doc) > self.max_pages else doc
110
+ elif hasattr(doc, "pages"):
111
+ pages = doc.pages[:self.max_pages] if len(doc.pages) > self.max_pages else doc.pages
112
+ else:
113
+ raise ValueError("Formato inesperado para el documento PDF.")
114
+
115
  raw_text = "\n".join(
116
+ [block.text for page in pages for block in page.blocks]
117
  )
118
  return Preprocessor().preprocess_text(raw_text)
119
 
120
 
121
+
122
  class FileHandler:
123
  """Clase para manejar archivos temporales y limpieza."""
124