Richard Hsu committed on
Commit
f5be035
·
1 Parent(s): df0d3a5
Files changed (2) hide show
  1. app.py +7 -7
  2. requirements.txt +1 -1
app.py CHANGED
@@ -1,15 +1,15 @@
1
  import gradio as gr
2
- import fitz # PyMuPDF
3
 
4
  def extract_text_from_pdf(pdf_file):
5
- # Open the PDF file
6
- pdf_document = fitz.open(pdf_file.name)
 
7
  text = ""
8
 
9
- # Extract text from each page
10
- for page_num in range(len(pdf_document)):
11
- page = pdf_document.load_page(page_num)
12
- text += page.get_text()
13
 
14
  return text
15
 
 
1
  import gradio as gr
2
+ from langchain.document_loaders import PyPDFLoader
3
 
4
  def extract_text_from_pdf(pdf_file):
5
+ # Load the PDF file using PyPDFLoader
6
+ loader = PyPDFLoader(pdf_file.name)
7
+ documents = loader.load()
8
  text = ""
9
 
10
+ # Extract text from each document
11
+ for document in documents:
12
+ text += document.page_content
 
13
 
14
  return text
15
 
requirements.txt CHANGED
@@ -66,5 +66,5 @@ urllib3==2.0.3
66
  uvicorn==0.22.0
67
  websockets==11.0.3
68
  yarl==1.9.2
69
- pypdf
70
  pypdf2
 
66
  uvicorn==0.22.0
67
  websockets==11.0.3
68
  yarl==1.9.2
69
+ pypdf==3.10.0
70
  pypdf2