Spaces:

huggingchat
/

pdf-to-markdown

Runtime error

Liam Dyer committed on May 22, 2024

Commit

bf804f0

unverified ·

1 Parent(s): f3f7b6d

rebuild pdf reader after ocr

Files changed (1) hide show

app.py CHANGED Viewed

@@ -18,11 +18,14 @@ def convert(pdf_file):
         out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
         ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
         pdf_file = out_pdf_file
     # Extract text
     full_text = ""
     for idx, page in enumerate(reader.pages):
-        full_text += f"\n\n---- Page {idx} ----\n\n" + page.extract_text()
     # Extract metadata
     metadata = {

         out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
         ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
         pdf_file = out_pdf_file
+        reader = PdfReader(pdf_file)
     # Extract text
     full_text = ""
     for idx, page in enumerate(reader.pages):
+        text = page.extract_text()
+        if len(text) > 0:
+            full_text += f"\n\n---- Page {idx} ----\n\n" + page.extract_text()
     # Extract metadata
     metadata = {