sblumenf committed on
Commit
8e00b94
·
verified ·
1 Parent(s): ce354c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -46
app.py CHANGED
@@ -1,57 +1,25 @@
1
- import PyPDF2
2
- from pdfminer.high_level import extract_pages
3
- from pdfminer.layout import LTTextBoxHorizontal, LTFigure
4
  import gradio as gr
5
 
6
- def process_figure(fig):
7
- # Replace this with your actual figure processing logic (e.g., save image, get URL)
8
- # This is a placeholder for demonstration purposes
9
- processed_image_url = "https://via.placeholder.com/150" # Placeholder image URL
10
- return processed_image_url
11
-
12
  def parse_pdf(pdf_file, output_format):
13
  with open(pdf_file, 'rb') as file:
14
- pages = extract_pages(file)
15
-
16
- text = ""
17
- tables = [] # Placeholder for tables (implementation needed)
18
- figures = []
19
-
20
- for page in pages:
21
- for element in page:
22
- if isinstance(element, LTTextBoxHorizontal):
23
- text += element.get_text()
24
- elif isinstance(element, LTFigure):
25
- figures.append(element)
26
-
27
- # Extract tables (more advanced techniques might be needed)
28
- # ... (Implement table extraction logic here)
29
-
30
- if output_format == "JSON":
31
- # Replace this with your JSON conversion logic, including tables and figures
32
- json_output = {"text": text, "figures": figures} # Placeholder for JSON conversion
33
- return json_output
34
- elif output_format == "Markdown":
35
- processed_image_url = ""
36
- markdown_output = f"# Extracted Text\n\n{text}\n\n# Figures\n"
37
- for fig in figures:
38
- # Process each figure (e.g., save as image)
39
- processed_image_url = process_figure(fig)
40
- markdown_output += f"\n![]({processed_image_url})"
41
- return markdown_output
42
- elif output_format == "HTML":
43
- processed_image_url = "" # Define outside the loop for HTML output
44
- html_output = f"<p>{text}</p>\n"
45
- for fig in figures:
46
- # Process each figure (e.g., save as image)
47
- processed_image_url = process_figure(fig)
48
- html_output += f"<img src='{processed_image_url}' alt='Figure'>"
49
- return html_output
50
 
51
  # Create the Gradio interface
52
  iface = gr.Interface(
53
  fn=parse_pdf,
54
- inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],
55
  outputs="text",
56
  title="PDF Parser",
57
  description="Parse a PDF and choose the output format."
 
1
+ import marker
 
 
2
  import gradio as gr
3
 
 
 
 
 
 
 
4
  def parse_pdf(pdf_file, output_format):
5
  with open(pdf_file, 'rb') as file:
6
+ if output_format == "Markdown":
7
+ markdown_text = marker.convert(file)
8
+ return markdown_text
9
+ elif output_format == "HTML":
10
+ # Convert to HTML using marker-pdf's advanced parsing capabilities
11
+ # You might need to explore additional options and parameters for fine-tuning the output
12
+ html_text = marker.convert(file, output_format="html")
13
+ return html_text
14
+ elif output_format == "JSON":
15
+ # Convert to JSON using marker-pdf's structured output
16
+ json_output = marker.convert(file, output_format="json")
17
+ return json_output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  # Create the Gradio interface
20
  iface = gr.Interface(
21
  fn=parse_pdf,
22
+ inputs=["file", gr.Dropdown(["Markdown", "HTML", "JSON"])],
23
  outputs="text",
24
  title="PDF Parser",
25
  description="Parse a PDF and choose the output format."