Spaces:
Running
on
Zero
Running
on
Zero
Liam Dyer
committed on
extract metadata before ocr
Browse files
app.py
CHANGED
@@ -8,6 +8,15 @@ import ocrmypdf
|
|
8 |
def convert(pdf_file):
|
9 |
reader = PdfReader(pdf_file)
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
# Check if there are any images
|
12 |
image_count = 0
|
13 |
for page in reader.pages:
|
@@ -27,15 +36,6 @@ def convert(pdf_file):
|
|
27 |
if len(text) > 0:
|
28 |
full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"
|
29 |
|
30 |
-
# Extract metadata
|
31 |
-
metadata = {
|
32 |
-
"author": reader.metadata.author,
|
33 |
-
"creator": reader.metadata.creator,
|
34 |
-
"producer": reader.metadata.producer,
|
35 |
-
"subject": reader.metadata.subject,
|
36 |
-
"title": reader.metadata.title,
|
37 |
-
}
|
38 |
-
|
39 |
return full_text.strip(), metadata
|
40 |
|
41 |
|
|
|
8 |
def convert(pdf_file):
|
9 |
reader = PdfReader(pdf_file)
|
10 |
|
11 |
+
# Extract metadata
|
12 |
+
metadata = {
|
13 |
+
"author": reader.metadata.author,
|
14 |
+
"creator": reader.metadata.creator,
|
15 |
+
"producer": reader.metadata.producer,
|
16 |
+
"subject": reader.metadata.subject,
|
17 |
+
"title": reader.metadata.title,
|
18 |
+
}
|
19 |
+
|
20 |
# Check if there are any images
|
21 |
image_count = 0
|
22 |
for page in reader.pages:
|
|
|
36 |
if len(text) > 0:
|
37 |
full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
return full_text.strip(), metadata
|
40 |
|
41 |
|