sblumenf committed on
Commit
7d2c1f9
·
verified ·
1 Parent(s): e147409

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -21
app.py CHANGED
@@ -1,31 +1,47 @@
1
  from marker import convert
2
  import gradio as gr
3
 
4
- def parse_pdf(pdf_file, output_format):
5
- with open(pdf_file, 'rb') as file:
6
- if output_format == "Markdown":
7
- markdown_text = convert(file)
8
- return markdown_text
9
- elif output_format == "HTML":
10
- # Convert to HTML using marker-pdf's advanced parsing capabilities
11
- # You might need to explore additional options and parameters for fine-tuning the output
12
- html_text = convert(file, output_format="html")
13
- return html_text
14
- elif output_format == "JSON":
15
- # Convert to JSON using marker-pdf's structured output
16
- json_output = convert(file, output_format="json")
17
- return json_output
18
-
19
- # Create the Gradio interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  iface = gr.Interface(
21
  fn=parse_pdf,
22
- inputs=["file", gr.Dropdown(["Markdown", "HTML", "JSON"])],
23
  outputs="text",
24
  title="PDF Parser",
25
  description="Parse a PDF and choose the output format."
26
  )
27
 
28
- # Launch the Gradio app (optional: add share=True for a public link)
29
- if __name__ == "__main__":
30
- iface.launch() # For local development
31
- # iface.launch(share=True) # For a public link
 
1
  from marker import convert
2
  import gradio as gr
3
 
4
def parse_pdf(pdf_content, output_format):
    """Convert raw PDF bytes into Markdown, HTML, or JSON output.

    Both arguments are validated before any conversion is attempted, and
    the document is converted exactly once, in the requested format only.

    Args:
        pdf_content (bytes): The content of the PDF file in bytes format.
        output_format (str): The desired output format; one of
            "Markdown", "HTML", or "JSON" (case-sensitive).

    Returns:
        The value produced by ``convert`` for the requested format. The
        interface displays it in a text output, so it is expected to be
        text -- TODO confirm for the JSON format.

    Raises:
        TypeError: If ``pdf_content`` is not ``bytes``.
        ValueError: If ``output_format`` is not one of the supported formats.
    """
    if not isinstance(pdf_content, bytes):
        raise TypeError("Input must be bytes representing the PDF content.")

    # Extra keyword arguments handed to ``convert`` for each supported
    # format. Markdown uses the plain call with no extra arguments.
    # NOTE(review): the ``output_format`` keyword is carried over unchanged
    # from the previous revision -- confirm it against the installed
    # marker-pdf version.
    convert_kwargs = {
        "Markdown": {},
        "HTML": {"output_format": "html"},
        "JSON": {"output_format": "json"},
    }

    # Reject unsupported formats *before* converting: conversion is the
    # expensive step, and a bad format should fail fast. TypeError covers
    # unhashable values that cannot be used as a dictionary key.
    try:
        extra_kwargs = convert_kwargs[output_format]
    except (KeyError, TypeError):
        raise ValueError(
            "Invalid output format. Supported formats: Markdown, HTML, JSON"
        ) from None

    # Use in-memory conversion using marker-pdf for deployment compatibility.
    return convert(pdf_content, **extra_kwargs)
35
+
36
# Define the Gradio interface.
# ``type="binary"`` makes Gradio pass the uploaded file to ``parse_pdf`` as
# raw bytes, which is what its input check requires; ``file_types``
# restricts the upload picker to PDF files.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        gr.File(type="binary", file_types=[".pdf"], label="Upload PDF"),
        # A default value guarantees ``parse_pdf`` never receives ``None``
        # when the user submits without touching the dropdown.
        gr.Dropdown(
            choices=["Markdown", "HTML", "JSON"],
            value="Markdown",
            label="Output format",
        ),
    ],
    outputs="text",
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

# Start the app when this file is executed as a script. This is needed for
# local development, and Hugging Face Spaces using the Gradio SDK also run
# app.py as a script, so the call must stay enabled for deployment.
# Pass share=True to launch() for a temporary public link when running locally.
if __name__ == "__main__":
    iface.launch()