visualizingjp pszemraj committed on
Commit fa95d76 · 0 Parent(s)

Duplicate from pszemraj/pdf-ocr

Co-authored-by: Peter Szemraj <[email protected]>

Files changed (7)
  1. .gitattributes +31 -0
  2. .gitignore +29 -0
  3. README.md +14 -0
  4. app.py +182 -0
  5. example_file.pdf +0 -0
  6. pdf2text.py +403 -0
  7. requirements.txt +9 -0
.gitattributes ADDED
@@ -0,0 +1,31 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED
@@ -0,0 +1,29 @@
+ # logs
+ *.log
+ *LOGFILE*
+
+ # output files need to be force-added
+ *.csv
+ *.png
+ *.jpg
+ *.jpeg
+ *.pkl
+ *.xlsx
+ *.txt
+
+ # cache
+ *__pycache__/
+ *.pyc
+
+ # reports folder - need to be force-added
+ *reports/
+
+ # scratch files and folders
+
+ *scratch*
+ *scratch/
+
+ # notebooks
+
+ *notebooks/
+ *.ipynb

README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: PDF OCR
+ emoji: 📝📎
+ colorFrom: orange
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 3.4
+ app_file: app.py
+ pinned: false
+ license: gpl-3.0
+ duplicated_from: pszemraj/pdf-ocr
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED
@@ -0,0 +1,182 @@
+ import logging
+ import time
+ from pathlib import Path
+ import contextlib
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(levelname)s - %(message)s",
+ )
+
+
+ import gradio as gr
+ import nltk
+ import torch
+
+ from pdf2text import *
+
+ _here = Path(__file__).parent
+
+ nltk.download("stopwords")  # TODO=find where this requirement originates from
+
+
+ def load_uploaded_file(file_obj, temp_dir: Path = None):
+     """
+     load_uploaded_file - process an uploaded file
+
+     Args:
+         file_obj (POTENTIALLY list): Gradio file object inside a list
+
+     Returns:
+         str, the uploaded file contents
+     """
+
+     # check if mysterious file object is a list
+     if isinstance(file_obj, list):
+         file_obj = file_obj[0]
+     file_path = Path(file_obj.name)
+
+     if temp_dir is None:
+         _temp_dir = _here / "temp"
+         _temp_dir.mkdir(exist_ok=True)
+
+     try:
+         pdf_bytes_obj = open(file_path, "rb").read()
+         temp_path = temp_dir / file_path.name if temp_dir else file_path
+         # save to PDF file
+         with open(temp_path, "wb") as f:
+             f.write(pdf_bytes_obj)
+         logging.info(f"Saved uploaded file to {temp_path}")
+         return str(temp_path.resolve())
+
+     except Exception as e:
+         logging.error(f"Trying to load file with path {file_path}, error: {e}")
+         print(f"Trying to load file with path {file_path}, error: {e}")
+         return None
+
+
+ def convert_PDF(
+     pdf_obj,
+     language: str = "en",
+     max_pages=20,
+ ):
+     """
+     convert_PDF - convert a PDF file to text
+
+     Args:
+         pdf_bytes_obj (bytes): PDF file contents
+         language (str, optional): Language to use for OCR. Defaults to "en".
+
+     Returns:
+         str, the PDF file contents as text
+     """
+     # clear local text cache
+     rm_local_text_files()
+     global ocr_model
+     st = time.perf_counter()
+     if isinstance(pdf_obj, list):
+         pdf_obj = pdf_obj[0]
+     file_path = Path(pdf_obj.name)
+     if not file_path.suffix == ".pdf":
+         logging.error(f"File {file_path} is not a PDF file")
+
+         html_error = f"""
+         <div style="color: red; font-size: 20px; font-weight: bold;">
+         File {file_path} is not a PDF file. Please upload a PDF file.
+         </div>
+         """
+         return "File is not a PDF file", html_error, None
+
+     conversion_stats = convert_PDF_to_Text(
+         file_path,
+         ocr_model=ocr_model,
+         max_pages=max_pages,
+     )
+     converted_txt = conversion_stats["converted_text"]
+     num_pages = conversion_stats["num_pages"]
+     was_truncated = conversion_stats["truncated"]
+     # if alt_lang: # TODO: fix this
+
+     rt = round((time.perf_counter() - st) / 60, 2)
+     print(f"Runtime: {rt} minutes")
+     html = ""
+     if was_truncated:
+         html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
+     html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
+
+     _output_name = f"RESULT_{file_path.stem}_OCR.txt"
+     with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
+         f.write(converted_txt)
+
+     return converted_txt, html, _output_name
+
+
+ if __name__ == "__main__":
+     logging.info("Starting app")
+
+     use_GPU = torch.cuda.is_available()
+     logging.info(f"Using GPU status: {use_GPU}")
+     logging.info("Loading OCR model")
+     with contextlib.redirect_stdout(None):
+         ocr_model = ocr_predictor(
+             "db_resnet50",
+             "crnn_mobilenet_v3_large",
+             pretrained=True,
+             assume_straight_pages=True,
+         )
+
+     # define pdf bytes as None
+     pdf_obj = _here / "example_file.pdf"
+     pdf_obj = str(pdf_obj.resolve())
+     _temp_dir = _here / "temp"
+     _temp_dir.mkdir(exist_ok=True)
+
+     logging.info("starting demo")
+     demo = gr.Blocks()
+
+     with demo:
+
+         gr.Markdown("# PDF to Text")
+         gr.Markdown(
+             "A basic demo of pdf-to-text conversion using OCR from the [doctr](https://mindee.github.io/doctr/index.html) package"
+         )
+         gr.Markdown("---")
+
+         with gr.Column():
+
+             gr.Markdown("## Load Inputs")
+             gr.Markdown("Upload your own file & replace the default. Files should be < 10MB to avoid upload issues - search for a PDF compressor online as needed.")
+             gr.Markdown(
+                 "_If no file is uploaded, a sample PDF will be used. PDFs are truncated to 20 pages._"
+             )
+
+             uploaded_file = gr.File(
+                 label="Upload a PDF file",
+                 file_count="single",
+                 type="file",
+                 value=_here / "example_file.pdf",
+             )
+
+         gr.Markdown("---")
+
+         with gr.Column():
+             gr.Markdown("## Convert PDF to Text")
+             convert_button = gr.Button("Convert PDF!", variant="primary")
+             out_placeholder = gr.HTML("<p><em>Output will appear below:</em></p>")
+             gr.Markdown("### Output")
+             OCR_text = gr.Textbox(
+                 label="OCR Result", placeholder="The OCR text will appear here"
+             )
+             text_file = gr.File(
+                 label="Download Text File",
+                 file_count="single",
+                 type="file",
+                 interactive=False,
+             )
+
+             convert_button.click(
+                 fn=convert_PDF,
+                 inputs=[uploaded_file],
+                 outputs=[OCR_text, out_placeholder, text_file],
+             )
+     demo.launch(enable_queue=True)

example_file.pdf ADDED
Binary file (290 kB)
pdf2text.py ADDED
@@ -0,0 +1,403 @@
+ # -*- coding: utf-8 -*-
+ """
+
+ easyocr.py - A wrapper for easyocr to convert pdf to images to text
+ """
+
+ import logging
+ from pathlib import Path
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s %(levelname)s %(message)s",
+     datefmt="%m/%d/%Y %I:%M:%S",
+ )
+
+
+ import os
+ import pprint as pp
+ import re
+ import shutil
+ import time
+ from datetime import date, datetime
+ from os.path import basename, dirname, join
+ from pathlib import Path
+
+ from cleantext import clean
+ from doctr.io import DocumentFile
+ from doctr.models import ocr_predictor
+ from libretranslatepy import LibreTranslateAPI
+ from natsort import natsorted
+ from spellchecker import SpellChecker
+ from tqdm.auto import tqdm
+
+
+ def simple_rename(filepath, target_ext=".txt"):
+     _fp = Path(filepath)
+     basename = _fp.stem
+     return f"OCR_{basename}_{target_ext}"
+
+
+ def rm_local_text_files(name_contains="RESULT_"):
+     """
+     rm_local_text_files - remove local text files
+
+     Args:
+         name_contains (str, optional): [description]. Defaults to "OCR_".
+     """
+     files = [
+         f
+         for f in Path.cwd().iterdir()
+         if f.is_file() and f.suffix == ".txt" and name_contains in f.name
+     ]
+     logging.info(f"removing {len(files)} text files")
+     for f in files:
+         os.remove(f)
+     logging.info("done")
+
+
+ def corr(
+     s: str,
+     add_space_when_numerics=False,
+     exceptions=["e.g.", "i.e.", "etc.", "cf.", "vs.", "p."],
+ ) -> str:
+     """corrects spacing in a string
+
+     Args:
+         s (str): the string to correct
+         add_space_when_numerics (bool, optional): [add a space when a period is between two numbers, example 5.73]. Defaults to False.
+         exceptions (list, optional): [do not change these substrings]. Defaults to ['e.g.', 'i.e.', 'etc.', 'cf.', 'vs.', 'p.'].
+
+     Returns:
+         str: the corrected string
+     """
+     if add_space_when_numerics:
+         s = re.sub(r"(\d)\.(\d)", r"\1. \2", s)
+
+     s = re.sub(r"\s+", " ", s)
+     s = re.sub(r'\s([?.!"](?:\s|$))', r"\1", s)
+
+     # fix space before apostrophe
+     s = re.sub(r"\s\'", r"'", s)
+     # fix space after apostrophe
+     s = re.sub(r"'\s", r"'", s)
+     # fix space before comma
+     s = re.sub(r"\s,", r",", s)
+
+     for e in exceptions:
+         expected_sub = re.sub(r"\s", "", e)
+         s = s.replace(expected_sub, e)
+
+     return s
+
+
+ def fix_punct_spaces(string):
+     """
+     fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
+
+     Parameters
+     ----------
+     string : str, required, input string to be corrected
+
+     Returns
+     -------
+     str, corrected string
+     """
+
+     fix_spaces = re.compile(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*")
+     string = fix_spaces.sub(lambda x: "{} ".format(x.group(1).replace(" ", "")), string)
+     string = string.replace(" ' ", "'")
+     string = string.replace(' " ', '"')
+     return string.strip()
+
+
+ def clean_OCR(ugly_text: str):
+     """
+     clean_OCR - clean the OCR text files.
+
+     Parameters
+     ----------
+     ugly_text : str, required, input string to be cleaned
+
+     Returns
+     -------
+     str, cleaned string
+     """
+     # Remove all the newlines.
+     cleaned_text = ugly_text.replace("\n", " ")
+     # Remove all the tabs.
+     cleaned_text = cleaned_text.replace("\t", " ")
+     # Remove all the double spaces.
+     cleaned_text = cleaned_text.replace("  ", " ")
+     # Remove all the spaces at the beginning of the text.
+     cleaned_text = cleaned_text.lstrip()
+     # remove all instances of "- " and " - "
+     cleaned_text = cleaned_text.replace("- ", "")
+     cleaned_text = cleaned_text.replace(" -", "")
+     return fix_punct_spaces(cleaned_text)
+
+
+ def move2completed(from_dir, filename, new_folder="completed", verbose=False):
+
+     # this is the better version
+     old_filepath = join(from_dir, filename)
+
+     new_filedirectory = join(from_dir, new_folder)
+
+     if not os.path.isdir(new_filedirectory):
+         os.mkdir(new_filedirectory)
+         if verbose:
+             print("created new directory for files at: \n", new_filedirectory)
+     new_filepath = join(new_filedirectory, filename)
+
+     try:
+         shutil.move(old_filepath, new_filepath)
+         logging.info("successfully moved the file {} to */completed.".format(filename))
+     except:
+         logging.info(
+             "ERROR! unable to move file to \n{}. Please investigate".format(
+                 new_filepath
+             )
+         )
+
+
+ """## pdf2text functions
+
+ """
+
+
+ custom_replace_list = {
+     "t0": "to",
+     "'$": "'s",
+     ",,": ", ",
+     "_ ": " ",
+     " '": "'",
+ }
+
+ replace_corr_exceptions = {
+     "i. e.": "i.e.",
+     "e. g.": "e.g.",
+     "e. g": "e.g.",
+     " ,": ",",
+ }
+
+
+ spell = SpellChecker()
+
+
+ def check_word_spelling(word: str) -> bool:
+     """
+     check_word_spelling - check the spelling of a word
+
+     Args:
+         word (str): word to check
+
+     Returns:
+         bool: True if word is spelled correctly, False if not
+     """
+
+     misspelled = spell.unknown([word])
+
+     return len(misspelled) == 0
+
+
+ def eval_and_replace(text: str, match_token: str = "- ") -> str:
+     """
+     eval_and_replace - conditionally replace all instances of a substring in a string based on whether the eliminated substring results in a valid word
+
+     Args:
+         text (str): text to evaluate
+         match_token (str, optional): token to replace. Defaults to "- ".
+
+     Returns:
+         str: text with replaced tokens
+     """
+
+     if match_token not in text:
+         return text
+     else:
+         while True:
+             full_before_text = text.split(match_token, maxsplit=1)[0]
+             before_text = [
+                 char for char in full_before_text.split()[-1] if char.isalpha()
+             ]
+             before_text = "".join(before_text)
+             full_after_text = text.split(match_token, maxsplit=1)[-1]
+             after_text = [char for char in full_after_text.split()[0] if char.isalpha()]
+             after_text = "".join(after_text)
+             full_text = before_text + after_text
+             if check_word_spelling(full_text):
+                 text = full_before_text + full_after_text
+             else:
+                 text = full_before_text + " " + full_after_text
+             if match_token not in text:
+                 break
+     return text
+
+
+ def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
+     """
+     cleantxt_ocr - clean text from OCR
+
+     Args:
+         ugly_text (str): text to clean
+         lower (bool, optional): _description_. Defaults to False.
+         lang (str, optional): _description_. Defaults to "en".
+
+     Returns:
+         str: cleaned text
+     """
+     # a wrapper for clean text with options different than default
+
+     # https://pypi.org/project/clean-text/
+     cleaned_text = clean(
+         ugly_text,
+         fix_unicode=True,  # fix various unicode errors
+         to_ascii=True,  # transliterate to closest ASCII representation
+         lower=lower,  # lowercase text
+         no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
+         no_urls=True,  # replace all URLs with a special token
+         no_emails=True,  # replace all email addresses with a special token
+         no_phone_numbers=False,  # replace all phone numbers with a special token
+         no_numbers=False,  # replace all numbers with a special token
+         no_digits=False,  # replace all digits with a special token
+         no_currency_symbols=False,  # replace all currency symbols with a special token
+         no_punct=False,  # remove punctuations
+         replace_with_punct="",  # instead of removing punctuations you may replace them
+         replace_with_url="<URL>",
+         replace_with_email="<EMAIL>",
+         replace_with_phone_number="<PHONE>",
+         replace_with_number="<NUM>",
+         replace_with_digit="0",
+         replace_with_currency_symbol="<CUR>",
+         lang=lang,  # set to 'de' for German special handling
+     )
+
+     return cleaned_text
+
+
+ def format_ocr_out(OCR_data):
+
+     if isinstance(OCR_data, list):
+         text = " ".join(OCR_data)
+     else:
+         text = str(OCR_data)
+     _clean = cleantxt_ocr(text)
+     return corr(_clean)
+
+
+ def postprocess(text: str) -> str:
+     """to be used after recombining the lines"""
+
+     proc = corr(cleantxt_ocr(text))
+
+     for k, v in custom_replace_list.items():
+         proc = proc.replace(str(k), str(v))
+
+     proc = corr(proc)
+
+     for k, v in replace_corr_exceptions.items():
+         proc = proc.replace(str(k), str(v))
+
+     return eval_and_replace(proc)
+
+
+ def result2text(result, as_text=False) -> str or list:
+     """Convert OCR result to text"""
+
+     full_doc = []
+     for i, page in enumerate(result.pages, start=1):
+         text = ""
+         for block in page.blocks:
+             text += "\n\t"
+             for line in block.lines:
+                 for word in line.words:
+                     # print(dir(word))
+                     text += word.value + " "
+         full_doc.append(text)
+
+     return "\n".join(full_doc) if as_text else full_doc
+
+
+ def convert_PDF_to_Text(
+     PDF_file,
+     ocr_model=None,
+     max_pages: int = 20,
+ ):
+
+     st = time.perf_counter()
+     PDF_file = Path(PDF_file)
+     ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
+     logging.info(f"starting OCR on {PDF_file.name}")
+     doc = DocumentFile.from_pdf(PDF_file)
+     truncated = False
+     if len(doc) > max_pages:
+         logging.warning(
+             f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
+         )
+         doc = doc[:max_pages]
+         truncated = True
+
+     # Analyze
+     logging.info(f"running OCR on {len(doc)} pages")
+     result = ocr_model(doc)
+     raw_text = result2text(result)
+     proc_text = [format_ocr_out(r) for r in raw_text]
+     fin_text = [postprocess(t) for t in proc_text]
+
+     ocr_results = "\n\n".join(fin_text)
+
+     fn_rt = time.perf_counter() - st
+
+     logging.info("OCR complete")
+
+     results_dict = {
+         "num_pages": len(doc),
+         "runtime": round(fn_rt, 2),
+         "date": str(date.today()),
+         "converted_text": ocr_results,
+         "truncated": truncated,
+         "length": len(ocr_results),
+     }
+
+     return results_dict
+
+
+ # @title translation functions
+
+ lt = LibreTranslateAPI("https://translate.astian.org/")
+
+
+ def translate_text(text, source_l, target_l="en"):
+
+     return str(lt.translate(text, source_l, target_l))
+
+
+ def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
+     """translate a document from lang_start to lang_end
+
+     {'code': 'en', 'name': 'English'},
+     {'code': 'fr', 'name': 'French'},
+     {'code': 'de', 'name': 'German'},
+     {'code': 'it', 'name': 'Italian'},"""
+
+     src_folder = dirname(filepath)
+     src_folder = Path(src_folder)
+     trgt_folder = src_folder / f"translated_{lang_end}"
+     trgt_folder.mkdir(exist_ok=True)
+     with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
+         foreign_t = f.readlines()
+     in_name = basename(filepath)
+     translated_doc = []
+     for line in tqdm(
+         foreign_t, total=len(foreign_t), desc="translating {}...".format(in_name[:10])
+     ):
+         translated_line = translate_text(line, lang_start, lang_end)
+         translated_doc.append(translated_line)
+     t_out_name = "[To {}]".format(lang_end) + simple_rename(in_name) + ".txt"
+     out_path = join(trgt_folder, t_out_name)
+     with open(out_path, "w", encoding="utf-8", errors="ignore") as f_o:
+         f_o.writelines(translated_doc)
+     if verbose:
+         print("finished translating the document! - ", datetime.now())
+     return out_path

requirements.txt ADDED
@@ -0,0 +1,9 @@
+ clean-text[gpl]
+ python-doctr[torch]
+ gradio
+ libretranslatepy
+ natsort
+ nltk
+ pyspellchecker
+ torch
+ tqdm
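
For reference, a minimal sketch of calling the converter in pdf2text.py directly, outside the Gradio app. It assumes the packages in requirements.txt are installed and reuses the same doctr model configuration that app.py loads; only names defined in the files above are used.

# sketch: run OCR on a PDF from a plain Python script
from doctr.models import ocr_predictor

from pdf2text import convert_PDF_to_Text

# same detection/recognition pair that app.py loads
ocr_model = ocr_predictor(
    "db_resnet50",
    "crnn_mobilenet_v3_large",
    pretrained=True,
    assume_straight_pages=True,
)

# convert_PDF_to_Text returns a dict with keys like "converted_text",
# "num_pages", "runtime", and "truncated" (see pdf2text.py above)
results = convert_PDF_to_Text("example_file.pdf", ocr_model=ocr_model, max_pages=20)
print(results["num_pages"], results["runtime"], results["truncated"])
print(results["converted_text"][:500])  # preview the OCR output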