Spaces:
Sleeping
Sleeping
Update file_processing.py
Browse files- file_processing.py +14 -5
file_processing.py
CHANGED
@@ -28,13 +28,22 @@ from typing import List
|
|
28 |
import fitz # PyMuPDF
|
29 |
import pandas as pd
|
30 |
import docx
|
|
|
31 |
from langchain.docstore.document import Document
|
32 |
def read_pdf(file_path: str) -> str:
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
def read_docx(file_path: str) -> str:
|
40 |
doc = docx.Document(file_path)
|
|
|
28 |
import fitz # PyMuPDF
|
29 |
import pandas as pd
|
30 |
import docx
|
31 |
+
import tempfile
|
32 |
from langchain.docstore.document import Document
|
33 |
def read_pdf(file_path: str) -> str:
    """Return the text of every page of the PDF at *file_path*, concatenated.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The result of ``page.get_text()`` for each page, joined in page
        order with no separator. An empty string if the document has no
        pages.
    """
    # Open the path directly. The previous body read from an undefined name
    # (``file``) and copied the bytes into an unflushed temporary file before
    # handing that file's name to fitz, so it raised NameError on every call.
    doc = fitz.open(file_path)
    try:
        # join() avoids repeated string concatenation across pages.
        return "".join(page.get_text() for page in doc)
    finally:
        # Release the underlying file handle even if extraction fails.
        doc.close()
|
47 |
|
48 |
def read_docx(file_path: str) -> str:
|
49 |
doc = docx.Document(file_path)
|