sblumenf committed on
Commit
b4b5bbe
·
verified ·
1 Parent(s): cc5c62b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -19
app.py CHANGED
@@ -1,27 +1,54 @@
 
 
 
1
  import gradio as gr
2
- import PyMuPDF as fitz # Importing PyMuPDF as fitz
3
 
4
- # Function to extract text from a PDF
5
def extract_pdf_text(file):
    """Return the concatenated text of every page in an uploaded PDF.

    Args:
        file: Upload object whose ``.name`` attribute holds the path of the
            PDF on disk.

    Returns:
        The text of all pages, joined in page order.
    """
    # Open through a context manager so the document handle is released as
    # soon as the text has been read, even if a page fails to parse.
    with fitz.open(file.name) as doc:
        return "".join(page.get_text() for page in doc)
11
 
12
- # Gradio interface
13
- output_format_dropdown = gr.Dropdown(
14
- choices=["txt", "pdf", "docx"],
15
- label="Output Format",
16
- default="txt"
17
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
 
19
  iface = gr.Interface(
20
- fn=extract_pdf_text,
21
- inputs=gr.File(label="Upload PDF File"),
22
- outputs=[gr.Textbox(label="Extracted Text"), output_format_dropdown],
23
- live=True
 
24
  )
25
 
 
26
  if __name__ == "__main__":
27
- iface.launch()
 
1
+ import PyPDF2
2
+ from pdfminer.high_level import extract_pages
3
+ from pdfminer.layout import LTTextBoxHorizontal, LTFigure
4
  import gradio as gr
 
5
 
6
def parse_pdf(pdf_file, output_format):
    """Extract text and figures from a PDF and render them as a string.

    Args:
        pdf_file: Path to the PDF, or an upload object that exposes the path
            through a ``.name`` attribute.
        output_format: One of ``"JSON"``, ``"Markdown"`` or ``"HTML"``.

    Returns:
        The rendered document as a string in the requested format.

    Raises:
        ValueError: If ``output_format`` is not one of the supported formats.
    """
    import html
    import json

    # Fail fast, before any file is opened, instead of silently returning None.
    if output_format not in ("JSON", "Markdown", "HTML"):
        raise ValueError(f"Unsupported output format: {output_format!r}")

    # The upload widget may hand over a temp-file wrapper rather than a path.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    text_parts = []
    figures = []
    # extract_pages() yields pages lazily, so the file must stay open for the
    # whole iteration rather than only for the call that creates the iterator.
    with open(pdf_path, "rb") as file:
        for page in extract_pages(file):
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                elif isinstance(element, LTFigure):
                    figures.append(element)
    text = "".join(text_parts)

    # TODO: table extraction is not implemented yet.

    if output_format == "JSON":
        # Layout objects are not JSON-serializable; emit a plain description.
        figure_info = [
            {"name": str(fig.name), "bbox": list(fig.bbox)} for fig in figures
        ]
        return json.dumps(
            {"text": text, "figures": figure_info},
            ensure_ascii=False,
            indent=2,
        )

    if output_format == "Markdown":
        # TODO: figures are only listed by name; image export is not done yet.
        parts = [f"# Extracted Text\n\n{text}\n\n# Figures\n"]
        parts.extend(f"- {fig.name}\n" for fig in figures)
        return "".join(parts)

    # HTML: escape PDF-derived text so it cannot inject markup into the page.
    # TODO: figures are only listed by name; image embedding is not done yet.
    parts = [f"<p>{html.escape(text)}</p>\n"]
    parts.extend(
        f"<p>Figure: {html.escape(str(fig.name))}</p>\n" for fig in figures
    )
    return "".join(parts)
42
 
43
# Wire parse_pdf into a simple web form: a file upload plus a format selector
# as inputs, and a single text area for the rendered result.
iface = gr.Interface(
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
    fn=parse_pdf,
    inputs=[
        "file",
        gr.Dropdown(choices=["JSON", "Markdown", "HTML"]),
    ],
    outputs="text",
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()