sblumenf committed on
Commit
546291c
·
verified ·
1 Parent(s): 5e94ef1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -54
app.py CHANGED
@@ -5,36 +5,7 @@ from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
5
  import os # Import os for file path manipulation
6
 
7
  def parse_pdf(pdf_file, output_format):
8
- with open(pdf_file, 'rb') as file:
9
- pages = extract_pages(file)
10
-
11
- text = ""
12
- tables = [] # Placeholder for extracted table data
13
- images = [] # List to store extracted image data
14
-
15
- for page in pages:
16
- for element in page:
17
- if isinstance(element, LTTextBoxHorizontal):
18
- text += element.get_text()
19
- elif isinstance(element, (LTFigure, LTImage)):
20
- # Extract image data (e.g., save as image, convert to base64)
21
- # ... (Implement image processing logic)
22
- # Here's an example of extracting image data and saving the image
23
- if hasattr(element, 'stream'): # Check for image data stream (LTImage)
24
- image_data = element.stream.read()
25
- else: # Handle LTFigure (may require additional processing)
26
- # ... (Implement logic to extract image data from LTFigure)
27
- # You might need libraries like Pillow for image manipulation
28
- image_data = b"Placeholder for extracted image data" # Example placeholder
29
-
30
- image_filename = f"extracted_image_{len(images)}.jpg"
31
- with open(image_filename, 'wb') as image_file:
32
- image_file.write(image_data)
33
- images.append({"filename": image_filename}) # Add filename to image data
34
-
35
- # Implement table extraction logic (e.g., using heuristics or advanced techniques)
36
- # You can use libraries like Camelot for complex tables
37
- # ...
38
 
39
  # Convert extracted data to desired format and populate download_data
40
  if output_format == "JSON":
@@ -43,34 +14,17 @@ def parse_pdf(pdf_file, output_format):
43
  "tables": tables, # Replace with actual table data
44
  "images": images # List of dictionaries with filenames
45
  }
46
- download_data = json.dumps(json_data).encode("utf-8") # Encode JSON for download
47
 
48
  elif output_format == "Markdown":
49
- # Implement table conversion using mistletoe or other Markdown libraries (uncomment if needed)
50
- # markdown_tables = mistletoe.markdown(convert_table=True)(tables) # Example using mistletoe
51
-
52
- markdown_text = f"# Extracted Text\n\n{text}\n\n# Images\n"
53
- # Implement logic to embed images within Markdown (optional)
54
- # ... (e.g., use relative paths if images are saved locally)
55
- # or (consider alternative Markdown image embedding methods)
56
- download_data = markdown_text.encode("utf-8")
57
 
58
  elif output_format == "HTML":
59
- # Implement table conversion using HTML table tags
60
- html_tables = "<table>" # Start of HTML table (replace with actual table structure)
61
- # ... (Implement table data conversion to HTML)
62
- # html_tables += "</table>"
63
-
64
- html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n{html_tables}\n\n<h2>Images</h2>\n"
65
- # Implement logic to display images within HTML (optional)
66
- # ... (e.g., use `<img>` tags with image source)
67
- download_data = html_text.encode("utf-8")
68
-
69
- # Create a temporary directory to store downloaded files (optional)
70
- # download_dir = tempfile.mkdtemp() # Uncomment if needed for temporary storage
71
 
72
- # Return the extracted text and the filename (or path) for download
73
- return text, os.path.join(os.getcwd(), images[0]["filename"]) # Example using first image
74
 
75
  iface = gr.Interface(
76
  fn=parse_pdf,
@@ -84,4 +38,4 @@ iface = gr.Interface(
84
  )
85
 
86
  if __name__ == "__main__":
87
- iface.launch(share=False) # Set share=False
 
5
  import os # Import os for file path manipulation
6
 
7
  def parse_pdf(pdf_file, output_format):
8
+ # ... (Your existing parsing logic)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  # Convert extracted data to desired format and populate download_data
11
  if output_format == "JSON":
 
14
  "tables": tables, # Replace with actual table data
15
  "images": images # List of dictionaries with filenames
16
  }
17
+ download_data = json.dumps(json_data) # No need to encode as Gradio handles it
18
 
19
  elif output_format == "Markdown":
20
+ # ... (Your Markdown conversion logic)
21
+ download_data = markdown_text
 
 
 
 
 
 
22
 
23
  elif output_format == "HTML":
24
+ # ... (Your HTML conversion logic)
25
+ download_data = html_text
 
 
 
 
 
 
 
 
 
 
26
 
27
+ return text, download_data
 
28
 
29
  iface = gr.Interface(
30
  fn=parse_pdf,
 
38
  )
39
 
40
  if __name__ == "__main__":
41
+ iface.launch(share=False) # Set share=False as Gradio warns about it on Hugging Face Spaces