Richard Hsu committed on
Commit
9a0b9e9
·
1 Parent(s): e32d984
Files changed (1) hide show
  1. app.py +5 -11
app.py CHANGED
@@ -1,16 +1,11 @@
1
- import fitz # PyMuPDF
2
- import pytesseract
3
  import gradio as gr
4
- from PIL import Image
5
 
6
  def pdf_to_text(pdf_file):
7
- doc = fitz.open(pdf_file)
8
- text = ""
9
- for page in doc:
10
- pix = page.get_pixmap()
11
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
12
- text += pytesseract.image_to_string(img)
13
- doc.close()
14
  return text
15
 
16
  def pdf_to_text_interface(pdf_file):
@@ -19,4 +14,3 @@ def pdf_to_text_interface(pdf_file):
19
 
20
  iface = gr.Interface(fn=pdf_to_text_interface, inputs="file", outputs="text", title="PDF to Text Converter <3")
21
  iface.launch()
22
-
 
 
 
1
  import gradio as gr
2
+ from langchain.document_loaders import PyPDFLoader
3
 
4
  def pdf_to_text(pdf_file):
5
+ loader = PyPDFLoader(pdf_file.name)
6
+ documents = loader.load()
7
+ text = "\n".join([doc.page_content for doc in documents])
8
+ print(text) # Log the loaded text
 
 
 
9
  return text
10
 
11
  def pdf_to_text_interface(pdf_file):
 
14
 
15
  iface = gr.Interface(fn=pdf_to_text_interface, inputs="file", outputs="text", title="PDF to Text Converter <3")
16
  iface.launch()