sblumenf committed on
Commit
f15272f
·
verified ·
1 Parent(s): f598e4b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -20
app.py CHANGED
@@ -3,27 +3,26 @@ import gradio as gr
3
  from pdfminer.high_level import extract_pages, extract_text
4
  from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
5
 
6
- # Optional import for Markdown table generation (comment out if not needed)
7
- # import mistletoe
8
-
9
  def parse_pdf(pdf_file, output_format):
10
  with open(pdf_file, 'rb') as file:
11
  pages = extract_pages(file)
12
 
13
  text = ""
14
  tables = [] # Placeholder for extracted table data
15
- images = [] # Placeholder for extracted image data
16
 
17
  for page in pages:
18
  for element in page:
19
  if isinstance(element, LTTextBoxHorizontal):
20
  text += element.get_text()
21
- elif isinstance(element, LTFigure):
22
- # Extract image data (e.g., save as image, convert to base64)
23
- images.append(element)
24
- elif isinstance(element, LTImage):
25
  # Extract image data (e.g., save as image, convert to base64)
26
- images.append(element)
 
 
 
 
 
27
 
28
  # Implement table extraction logic (e.g., using heuristics or advanced techniques)
29
  # You can use libraries like Camelot for complex tables
@@ -34,7 +33,7 @@ def parse_pdf(pdf_file, output_format):
34
  json_data = {
35
  "text": text,
36
  "tables": tables, # Replace with actual table data
37
- "images": images # Replace with actual image data (e.g., base64)
38
  }
39
  download_data = json.dumps(json_data).encode("utf-8") # Encode JSON for download
40
 
@@ -42,13 +41,10 @@ def parse_pdf(pdf_file, output_format):
42
  # Implement table conversion using mistletoe or other Markdown libraries (uncomment if needed)
43
  # markdown_tables = mistletoe.markdown(convert_table=True)(tables) # Example using mistletoe
44
 
45
- markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
46
- # Implement table conversion (e.g., manually create Markdown table structure)
47
- # ... (replace with your table conversion logic)
48
- # markdown_text += markdown_tables # Uncomment if using mistletoe
49
- markdown_text += "\n# Images\n"
50
- # Implement image conversion (e.g., relative paths or base64 encoding)
51
- # ...
52
  download_data = markdown_text.encode("utf-8")
53
 
54
  elif output_format == "HTML":
@@ -58,8 +54,8 @@ def parse_pdf(pdf_file, output_format):
58
  # html_tables += "</table>"
59
 
60
  html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n{html_tables}\n\n<h2>Images</h2>\n"
61
- # Implement image conversion using `<img>` tag
62
- # ...
63
  download_data = html_text.encode("utf-8")
64
 
65
  return text, download_data
@@ -76,4 +72,4 @@ iface = gr.Interface(
76
  )
77
 
78
  if __name__ == "__main__":
79
- iface.launch(share=True) # Set share=True to create a public link
 
3
  from pdfminer.high_level import extract_pages, extract_text
4
  from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
5
 
 
 
 
6
  def parse_pdf(pdf_file, output_format):
7
  with open(pdf_file, 'rb') as file:
8
  pages = extract_pages(file)
9
 
10
  text = ""
11
  tables = [] # Placeholder for extracted table data
12
+ images = [] # List to store extracted image data
13
 
14
  for page in pages:
15
  for element in page:
16
  if isinstance(element, LTTextBoxHorizontal):
17
  text += element.get_text()
18
+ elif isinstance(element, (LTFigure, LTImage)):
 
 
 
19
  # Extract image data (e.g., save as image, convert to base64)
20
+ # ... (Implement image processing logic)
21
+ # Here's an example of saving images with a unique filename
22
+ image_data = element # Replace with your image extraction logic
23
+ image_filename = f"extracted_image_{len(images)}.jpg"
24
+ # ... (Implement image saving logic using the filename)
25
+ images.append({"filename": image_filename}) # Add filename to image data
26
 
27
  # Implement table extraction logic (e.g., using heuristics or advanced techniques)
28
  # You can use libraries like Camelot for complex tables
 
33
  json_data = {
34
  "text": text,
35
  "tables": tables, # Replace with actual table data
36
+ "images": images # List of dictionaries with filenames
37
  }
38
  download_data = json.dumps(json_data).encode("utf-8") # Encode JSON for download
39
 
 
41
  # Implement table conversion using mistletoe or other Markdown libraries (uncomment if needed)
42
  # markdown_tables = mistletoe.markdown(convert_table=True)(tables) # Example using mistletoe
43
 
44
+ markdown_text = f"# Extracted Text\n\n{text}\n\n# Images\n"
45
+ # Implement logic to embed images within Markdown (optional)
46
+ # ... (e.g., use relative paths if images are saved locally)
47
+ # or (consider alternative Markdown image embedding methods)
 
 
 
48
  download_data = markdown_text.encode("utf-8")
49
 
50
  elif output_format == "HTML":
 
54
  # html_tables += "</table>"
55
 
56
  html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n{html_tables}\n\n<h2>Images</h2>\n"
57
+ # Implement logic to display images within HTML (optional)
58
+ # ... (e.g., use `<img>` tags with image source)
59
  download_data = html_text.encode("utf-8")
60
 
61
  return text, download_data
 
72
  )
73
 
74
  if __name__ == "__main__":
75
+ iface.launch(share=False) # Set share=False as Gradio warns about it on Hugging Face Spaces