leavoigt committed on
Commit
57b378c
·
verified ·
1 Parent(s): 7e1341a

Update utils/preprocessing.py

Browse files
Files changed (1) hide show
  1. utils/preprocessing.py +36 -20
utils/preprocessing.py CHANGED
@@ -1,7 +1,8 @@
1
  from haystack.nodes.base import BaseComponent
2
  from haystack.schema import Document
3
- from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
4
  from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
 
5
  from typing import Callable, Dict, List, Optional, Text, Tuple, Union
6
  from typing_extensions import Literal
7
  import pandas as pd
@@ -9,7 +10,9 @@ import logging
9
  import re
10
  import string
11
  from haystack.pipelines import Pipeline
 
12
 
 
13
  def useOCR(file_path: str)-> Text:
14
  """
15
  Converts image pdfs into text, Using the Farm-haystack[OCR]
@@ -21,13 +24,30 @@ def useOCR(file_path: str)-> Text:
21
 
22
  Returns the text file as string.
23
  """
24
-
 
 
 
 
 
 
 
 
25
 
26
- converter = PDFToTextOCRConverter(remove_numeric_tables=True,
27
  valid_languages=["eng"])
28
- docs = converter.convert(file_path=file_path, meta=None)
29
- return docs[0].content
 
 
 
 
30
 
 
 
 
 
 
31
 
32
 
33
 
@@ -37,13 +57,10 @@ class FileConverter(BaseComponent):
37
  Converter class, will use internally haystack PDFToTextOCR in case of image
38
  pdf. Cannot use the FileClassifier from haystack as its doesnt has any
39
  label/output class for image.
40
-
41
  1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
42
  2. https://docs.haystack.deepset.ai/docs/file_converters
43
  3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
44
  4. https://docs.haystack.deepset.ai/reference/file-converters-api
45
-
46
-
47
  """
48
 
49
  outgoing_edges = 1
@@ -84,8 +101,6 @@ class FileConverter(BaseComponent):
84
 
85
  documents = []
86
 
87
-
88
- # encoding is empty, probably should be utf-8
89
  document = converter.convert(
90
  file_path=file_path, meta=None,
91
  encoding=encoding, id_hash_keys=id_hash_keys
@@ -101,10 +116,12 @@ class FileConverter(BaseComponent):
101
  if filtered == "":
102
  logging.info("Using OCR")
103
  text = useOCR(file_path)
104
-
105
  documents.append(Document(content=text,
106
  meta={"name": file_name},
107
  id_hash_keys=id_hash_keys))
 
 
108
 
109
  logging.info('file conversion succesful')
110
  output = {'documents': documents}
@@ -124,7 +141,6 @@ def basic(s:str, remove_punc:bool = False):
124
 
125
  """
126
  Performs basic cleaning of text.
127
-
128
  Params
129
  ----------
130
  s: string to be processed
@@ -150,6 +166,7 @@ def basic(s:str, remove_punc:bool = False):
150
 
151
  return s.strip()
152
 
 
153
  def paraLengthCheck(paraList, max_len = 100):
154
  """
155
  There are cases where preprocessor cannot respect word limit, when using
@@ -187,15 +204,13 @@ class UdfPreProcessor(BaseComponent):
187
  class to preprocess the document returned by FileConverter. It will check
188
  for splitting strategy and splits the document by word or sentences and then
189
  synthetically create the paragraphs.
190
-
191
  1. https://docs.haystack.deepset.ai/docs/preprocessor
192
  2. https://docs.haystack.deepset.ai/reference/preprocessor-api
193
  3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
194
-
195
  """
196
  outgoing_edges = 1
197
 
198
- def run(self, documents:List[Document], remove_punc:bool=False,
199
  split_by: Literal["sentence", "word"] = 'sentence',
200
  split_length:int = 2, split_respect_sentence_boundary:bool = False,
201
  split_overlap:int = 0):
@@ -250,8 +265,11 @@ class UdfPreProcessor(BaseComponent):
250
  # # basic cleaning before passing it to preprocessor.
251
  # i = basic(i)
252
  docs_processed = preprocessor.process([i])
253
- for item in docs_processed:
254
- item.content = basic(item.content, remove_punc= remove_punc)
 
 
 
255
 
256
  df = pd.DataFrame(docs_processed)
257
  all_text = " ".join(df.content.to_list())
@@ -275,7 +293,6 @@ def processingpipeline():
275
  """
276
  Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcesor
277
  from utils.preprocessing
278
-
279
  """
280
 
281
  preprocessing_pipeline = Pipeline()
@@ -287,5 +304,4 @@ def processingpipeline():
287
  preprocessing_pipeline.add_node(component = custom_preprocessor,
288
  name ='UdfPreProcessor', inputs=["FileConverter"])
289
 
290
- return preprocessing_pipeline
291
-
 
1
  from haystack.nodes.base import BaseComponent
2
  from haystack.schema import Document
3
+ from haystack.nodes import ImageToTextConverter, PDFToTextConverter
4
  from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
5
+ from pdf2image import convert_from_path
6
  from typing import Callable, Dict, List, Optional, Text, Tuple, Union
7
  from typing_extensions import Literal
8
  import pandas as pd
 
10
  import re
11
  import string
12
  from haystack.pipelines import Pipeline
13
+ import streamlit as st
14
 
15
+ @st.cache_data
16
  def useOCR(file_path: str)-> Text:
17
  """
18
  Converts image pdfs into text, Using the Farm-haystack[OCR]
 
24
 
25
  Returns the text file as string.
26
  """
27
+ # we need pdf file to be first converted into image file
28
+ # this will create each page as image file
29
+ images = convert_from_path(pdf_path = file_path)
30
+ list_ = []
31
+ # save image file in cache and read them one by one to pass it to OCR
32
+ for i, pdf in enumerate(images):
33
+ # Save pages as images in the pdf
34
+ pdf.save(f'PDF\image_converted_{i+1}.png', 'PNG')
35
+ list_.append(f'PDF\image_converted_{i+1}.png')
36
 
37
+ converter = ImageToTextConverter(remove_numeric_tables=True,
38
  valid_languages=["eng"])
39
+ # placeholder to collect the text from each page
40
+ placeholder = []
41
+ for file in list_:
42
+ document = converter.convert(
43
+ file_path=file, meta=None,
44
+ )[0]
45
 
46
+ text = document.content
47
+ placeholder.append(text)
48
+ # join the text from each page by page separator
49
+ text = '\x0c'.join(placeholder)
50
+ return text
51
 
52
 
53
 
 
57
  Converter class, will use internally haystack PDFToTextOCR in case of image
58
  pdf. Cannot use the FileClassifier from haystack as its doesnt has any
59
  label/output class for image.
 
60
  1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
61
  2. https://docs.haystack.deepset.ai/docs/file_converters
62
  3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
63
  4. https://docs.haystack.deepset.ai/reference/file-converters-api
 
 
64
  """
65
 
66
  outgoing_edges = 1
 
101
 
102
  documents = []
103
 
 
 
104
  document = converter.convert(
105
  file_path=file_path, meta=None,
106
  encoding=encoding, id_hash_keys=id_hash_keys
 
116
  if filtered == "":
117
  logging.info("Using OCR")
118
  text = useOCR(file_path)
119
+
120
  documents.append(Document(content=text,
121
  meta={"name": file_name},
122
  id_hash_keys=id_hash_keys))
123
+
124
+
125
 
126
  logging.info('file conversion succesful')
127
  output = {'documents': documents}
 
141
 
142
  """
143
  Performs basic cleaning of text.
 
144
  Params
145
  ----------
146
  s: string to be processed
 
166
 
167
  return s.strip()
168
 
169
+
170
  def paraLengthCheck(paraList, max_len = 100):
171
  """
172
  There are cases where preprocessor cannot respect word limit, when using
 
204
  class to preprocess the document returned by FileConverter. It will check
205
  for splitting strategy and splits the document by word or sentences and then
206
  synthetically create the paragraphs.
 
207
  1. https://docs.haystack.deepset.ai/docs/preprocessor
208
  2. https://docs.haystack.deepset.ai/reference/preprocessor-api
209
  3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
 
210
  """
211
  outgoing_edges = 1
212
 
213
+ def run(self, documents:List[Document], remove_punc:bool=False, apply_clean = True,
214
  split_by: Literal["sentence", "word"] = 'sentence',
215
  split_length:int = 2, split_respect_sentence_boundary:bool = False,
216
  split_overlap:int = 0):
 
265
  # # basic cleaning before passing it to preprocessor.
266
  # i = basic(i)
267
  docs_processed = preprocessor.process([i])
268
+ if apply_clean:
269
+ for item in docs_processed:
270
+ item.content = basic(item.content, remove_punc= remove_punc)
271
+ else:
272
+ pass
273
 
274
  df = pd.DataFrame(docs_processed)
275
  all_text = " ".join(df.content.to_list())
 
293
  """
294
  Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcesor
295
  from utils.preprocessing
 
296
  """
297
 
298
  preprocessing_pipeline = Pipeline()
 
304
  preprocessing_pipeline.add_node(component = custom_preprocessor,
305
  name ='UdfPreProcessor', inputs=["FileConverter"])
306
 
307
+ return preprocessing_pipeline