Spaces:
Running
on
Zero
Running
on
Zero
Liam Dyer
committed on
rebuild pdf reader after ocr
Browse files
app.py
CHANGED
@@ -18,11 +18,14 @@ def convert(pdf_file):
|
|
18 |
out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
|
19 |
ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
|
20 |
pdf_file = out_pdf_file
|
|
|
21 |
|
22 |
# Extract text
|
23 |
full_text = ""
|
24 |
for idx, page in enumerate(reader.pages):
|
25 |
-
|
|
|
|
|
26 |
|
27 |
# Extract metadata
|
28 |
metadata = {
|
|
|
18 |
out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
|
19 |
ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
|
20 |
pdf_file = out_pdf_file
|
21 |
+
reader = PdfReader(pdf_file)
|
22 |
|
23 |
# Extract text
|
24 |
full_text = ""
|
25 |
for idx, page in enumerate(reader.pages):
|
26 |
+
text = page.extract_text()
|
27 |
+
if len(text) > 0:
|
28 |
+
full_text += f"\n\n---- Page {idx} ----\n\n" + page.extract_text()
|
29 |
|
30 |
# Extract metadata
|
31 |
metadata = {
|