Sk4467 committed on
Commit
d42007c
·
verified ·
1 Parent(s): 5a893a5

Update file_processing.py

Browse files
Files changed (1) hide show
  1. file_processing.py +14 -5
file_processing.py CHANGED
@@ -28,13 +28,22 @@ from typing import List
28
  import fitz # PyMuPDF
29
  import pandas as pd
30
  import docx
 
31
  from langchain.docstore.document import Document
32
  def read_pdf(file_path: str) -> str:
33
- doc = fitz.open(file_path)
34
- text = ""
35
- for page in doc:
36
- text += page.get_text()
37
- return text
 
 
 
 
 
 
 
 
38
 
39
  def read_docx(file_path: str) -> str:
40
  doc = docx.Document(file_path)
 
28
  import fitz # PyMuPDF
29
  import pandas as pd
30
  import docx
31
+ import tempfile
32
  from langchain.docstore.document import Document
33
  def read_pdf(file_path: str) -> str:
34
+ with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
35
+ # Write the uploaded file's content to the temporary file
36
+ temp_pdf.write(file.file.read())
37
+ temp_pdf.seek(0) # Go to the start of the file
38
+
39
+ # Open the PDF with fitz
40
+ doc = fitz.open(temp_pdf.name)
41
+ text = ""
42
+ for page in doc:
43
+ text += page.get_text()
44
+
45
+ # No need to delete the temporary file - it's done automatically
46
+ return text
47
 
48
  def read_docx(file_path: str) -> str:
49
  doc = docx.Document(file_path)