Liam Dyer committed on
Commit
bf804f0
·
unverified ·
1 Parent(s): f3f7b6d

rebuild pdf reader after ocr

Browse files
Files changed (1) hide show
  1. app.py +4 -1
app.py CHANGED
@@ -18,11 +18,14 @@ def convert(pdf_file):
18
  out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
19
  ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
20
  pdf_file = out_pdf_file
 
21
 
22
  # Extract text
23
  full_text = ""
24
  for idx, page in enumerate(reader.pages):
25
- full_text += f"\n\n---- Page {idx} ----\n\n" + page.extract_text()
 
 
26
 
27
  # Extract metadata
28
  metadata = {
 
18
  out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
19
  ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
20
  pdf_file = out_pdf_file
21
+ reader = PdfReader(pdf_file)
22
 
23
  # Extract text
24
  full_text = ""
25
  for idx, page in enumerate(reader.pages):
26
+ text = page.extract_text()
27
+ if len(text) > 0:
28
+ full_text += f"\n\n---- Page {idx} ----\n\n" + page.extract_text()
29
 
30
  # Extract metadata
31
  metadata = {