Liam Dyer committed on
Commit
3bf066d
·
unverified ·
1 Parent(s): a2dee03

extract metadata before ocr

Browse files
Files changed (1) hide show
  1. app.py +9 -9
app.py CHANGED
@@ -8,6 +8,15 @@ import ocrmypdf
8
  def convert(pdf_file):
9
  reader = PdfReader(pdf_file)
10
 
 
 
 
 
 
 
 
 
 
11
  # Check if there are any images
12
  image_count = 0
13
  for page in reader.pages:
@@ -27,15 +36,6 @@ def convert(pdf_file):
27
  if len(text) > 0:
28
  full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"
29
 
30
- # Extract metadata
31
- metadata = {
32
- "author": reader.metadata.author,
33
- "creator": reader.metadata.creator,
34
- "producer": reader.metadata.producer,
35
- "subject": reader.metadata.subject,
36
- "title": reader.metadata.title,
37
- }
38
-
39
  return full_text.strip(), metadata
40
 
41
 
 
8
  def convert(pdf_file):
9
  reader = PdfReader(pdf_file)
10
 
11
+ # Extract metadata
12
+ metadata = {
13
+ "author": reader.metadata.author,
14
+ "creator": reader.metadata.creator,
15
+ "producer": reader.metadata.producer,
16
+ "subject": reader.metadata.subject,
17
+ "title": reader.metadata.title,
18
+ }
19
+
20
  # Check if there are any images
21
  image_count = 0
22
  for page in reader.pages:
 
36
  if len(text) > 0:
37
  full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"
38
 
 
 
 
 
 
 
 
 
 
39
  return full_text.strip(), metadata
40
 
41