Richard Hsu committed on
Commit
f59bbd9
·
1 Parent(s): 01f0fe3
Files changed (1) hide show
  1. app.py +21 -11
app.py CHANGED
@@ -1,16 +1,26 @@
1
  import gradio as gr
2
- from langchain.document_loaders import PyPDFLoader
3
 
4
- def pdf_to_text(pdf_file):
5
- loader = PyPDFLoader(pdf_file.name)
6
- documents = loader.load()
7
- text = "\n".join([doc.page_content for doc in documents])
8
- print(text) # Log the loaded text
 
 
 
 
 
9
  return text
10
 
11
- def pdf_to_text_interface(pdf_file):
12
- text = pdf_to_text(pdf_file)
13
- return text
 
 
 
 
 
14
 
15
- iface = gr.Interface(fn=pdf_to_text_interface, inputs="file", outputs="text", title="PDF to Text Converter <3")
16
- iface.launch()
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF
3
 
4
+ def extract_text_from_pdf(pdf_file):
5
+ # Open the PDF file
6
+ pdf_document = fitz.open(pdf_file.name)
7
+ text = ""
8
+
9
+ # Extract text from each page
10
+ for page_num in range(len(pdf_document)):
11
+ page = pdf_document.load_page(page_num)
12
+ text += page.get_text()
13
+
14
  return text
15
 
16
+ # Create a Gradio interface
17
+ interface = gr.Interface(
18
+ fn=extract_text_from_pdf,
19
+ inputs=gr.inputs.File(label="Upload PDF"),
20
+ outputs=gr.outputs.Textbox(label="Extracted Text"),
21
+ title="PDF Text Extractor",
22
+ description="Upload a PDF file to extract and display its text content."
23
+ )
24
 
25
+ # Launch the interface
26
+ interface.launch(share=True)