Sk4467 committed on
Commit
6c96db3
·
verified ·
1 Parent(s): 8967645

Update file_processing.py

Browse files
Files changed (1) hide show
  1. file_processing.py +8 -13
file_processing.py CHANGED
@@ -30,20 +30,15 @@ import pandas as pd
30
  import docx
31
  import tempfile
32
  from langchain.docstore.document import Document
 
33
  def read_pdf(file_path: str) -> str:
34
- with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
35
- # Write the uploaded file's content to the temporary file
36
- temp_pdf.write(file.file.read())
37
- temp_pdf.seek(0) # Go to the start of the file
38
-
39
- # Open the PDF with fitz
40
- doc = fitz.open(temp_pdf.name)
41
- text = ""
42
- for page in doc:
43
- text += page.get_text()
44
-
45
- # No need to delete the temporary file - it's done automatically
46
- return text
47
 
48
  def read_docx(file_path: str) -> str:
49
  doc = docx.Document(file_path)
 
30
  import docx
31
  import tempfile
32
  from langchain.docstore.document import Document
33
+
34
  def read_pdf(file_path: str) -> str:
35
+ # Open the PDF with fitz
36
+ doc = fitz.open(file_path)
37
+ text = ""
38
+ for page in doc:
39
+ text += page.get_text()
40
+
41
+ return text
 
 
 
 
 
 
42
 
43
  def read_docx(file_path: str) -> str:
44
  doc = docx.Document(file_path)