sblumenf committed on
Commit
28f23fa
·
verified ·
1 Parent(s): e17150e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -18
app.py CHANGED
@@ -10,6 +10,17 @@ import pdfplumber
10
  import tempfile
11
  import traceback
12
 
 
 
 
 
 
 
 
 
 
 
 
13
  def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
14
  """
15
  Parses a PDF file, extracts text, tables, and images, and formats the output.
@@ -34,23 +45,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
34
  if isinstance(element, LTTextBoxHorizontal):
35
  text += element.get_text()
36
  elif isinstance(element, (LTFigure, LTImage)):
37
- try:
38
- if hasattr(element, 'stream'):
39
- image_data = element.stream.get_rawdata()
40
- image = Image.open(io.BytesIO(image_data))
41
- image_filename = f"extracted_image_{len(images)}.png"
42
- image.save(image_filename)
43
- images.append({"filename": image_filename})
44
- else:
45
- for child in element:
46
- if isinstance(child, LTImage) and hasattr(child, 'stream'):
47
- image_data = child.stream.get_rawdata()
48
- image = Image.open(io.BytesIO(image_data))
49
- image_filename = f"extracted_image_{len(images)}.png"
50
- image.save(image_filename)
51
- images.append({"filename": image_filename})
52
- except Exception as e:
53
- print(f"Error extracting image: {e}")
54
 
55
  with pdfplumber.open(pdf_file) as pdf:
56
  for page_num, page in enumerate(pdf.pages):
@@ -120,4 +115,4 @@ iface = gr.Interface(
120
  )
121
 
122
  if __name__ == "__main__":
123
- iface.launch() # Temporarily disable sharing for debugging
 
10
  import tempfile
11
  import traceback
12
 
13
def save_image(element, images):
    """Extract the image(s) carried by a layout element and save them as PNG.

    An element that exposes a ``stream`` attribute is treated as a single
    image.  An element without one is treated as a container and its direct
    children that expose ``stream`` are extracted instead; this restores the
    handling of figure elements that the previous inline code performed.

    Args:
        element: Layout element to inspect (the caller passes LTFigure or
            LTImage objects).
        images: List that receives one ``{"filename": ...}`` dict per image
            written.  It is mutated in place, and its current length is used
            to number the output files.

    Returns:
        None.  Failures are printed and skipped; nothing is raised.
    """
    if hasattr(element, 'stream'):
        candidates = [element]
    else:
        try:
            # Containers yield their children when iterated.
            candidates = [child for child in element if hasattr(child, 'stream')]
        except TypeError:
            # Neither an image nor an iterable container: nothing to extract.
            candidates = []

    for candidate in candidates:
        # Guard each image separately so one undecodable stream does not
        # prevent the remaining images of the same figure from being saved.
        try:
            image_data = candidate.stream.get_rawdata()
            image = Image.open(io.BytesIO(image_data))
            # Relative path: the file lands in the current working directory.
            image_filename = f"extracted_image_{len(images)}.png"
            image.save(image_filename)
            images.append({"filename": image_filename})
        except Exception as e:
            print(f"Error extracting image: {e}")
23
+
24
  def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
25
  """
26
  Parses a PDF file, extracts text, tables, and images, and formats the output.
 
45
  if isinstance(element, LTTextBoxHorizontal):
46
  text += element.get_text()
47
  elif isinstance(element, (LTFigure, LTImage)):
48
+ save_image(element, images)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  with pdfplumber.open(pdf_file) as pdf:
51
  for page_num, page in enumerate(pdf.pages):
 
115
  )
116
 
117
  if __name__ == "__main__":
118
+ iface.launch() # Temporarily disable sharing for debugging