Spaces:
Sleeping
Sleeping
Update file_processing.py
Browse files
- file_processing.py +7 -6
file_processing.py
CHANGED
@@ -60,16 +60,18 @@ def read_txt(file_path: str) -> str:
|
|
60 |
with open(file_path, 'r', encoding='utf-8') as file:
|
61 |
return file.read()
|
62 |
|
63 |
-
async def load_documents(file: UploadFile)->List[Document]:
|
64 |
temp_file_path = f"temp_{file.filename}"
|
65 |
try:
|
66 |
# Save the uploaded file to a temporary file
|
67 |
with open(temp_file_path, "wb") as temp_file:
|
68 |
-
|
|
|
69 |
|
|
|
70 |
content = ""
|
71 |
if file.filename.endswith('.pdf'):
|
72 |
-
content = read_pdf(temp_file_path)
|
73 |
elif file.filename.endswith('.docx'):
|
74 |
content = read_docx(temp_file_path)
|
75 |
elif file.filename.endswith('.csv'):
|
@@ -79,19 +81,18 @@ async def load_documents(file: UploadFile)->List[Document]:
|
|
79 |
else:
|
80 |
raise ValueError("Unsupported file format")
|
81 |
except Exception as e:
|
82 |
-
# Handle general errors - log or adjust as necessary for your application
|
83 |
print(f"Error processing document: {e}")
|
84 |
content = "Error processing document."
|
85 |
finally:
|
86 |
-
# Cleanup: remove the temporary file
|
87 |
if os.path.exists(temp_file_path):
|
88 |
-
os.remove(temp_file_path)
|
89 |
|
90 |
metadata = {'source': file.filename}
|
91 |
document = Document(page_content=content, metadata=metadata)
|
92 |
return [document]
|
93 |
|
94 |
|
|
|
95 |
from langchain.text_splitter import CharacterTextSplitter
|
96 |
|
97 |
def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
|
|
|
60 |
with open(file_path, 'r', encoding='utf-8') as file:
|
61 |
return file.read()
|
62 |
|
63 |
+
async def load_documents(file: UploadFile) -> List[Document]:
|
64 |
temp_file_path = f"temp_{file.filename}"
|
65 |
try:
|
66 |
# Save the uploaded file to a temporary file
|
67 |
with open(temp_file_path, "wb") as temp_file:
|
68 |
+
contents = await file.read() # Read the content of the uploaded file
|
69 |
+
temp_file.write(contents) # Write the content to the temporary file
|
70 |
|
71 |
+
# Now you can pass temp_file_path to your read functions
|
72 |
content = ""
|
73 |
if file.filename.endswith('.pdf'):
|
74 |
+
content = read_pdf(temp_file_path) # Pass the path, not the file object
|
75 |
elif file.filename.endswith('.docx'):
|
76 |
content = read_docx(temp_file_path)
|
77 |
elif file.filename.endswith('.csv'):
|
|
|
81 |
else:
|
82 |
raise ValueError("Unsupported file format")
|
83 |
except Exception as e:
|
|
|
84 |
print(f"Error processing document: {e}")
|
85 |
content = "Error processing document."
|
86 |
finally:
|
|
|
87 |
if os.path.exists(temp_file_path):
|
88 |
+
os.remove(temp_file_path) # Clean up the temporary file
|
89 |
|
90 |
metadata = {'source': file.filename}
|
91 |
document = Document(page_content=content, metadata=metadata)
|
92 |
return [document]
|
93 |
|
94 |
|
95 |
+
|
96 |
from langchain.text_splitter import CharacterTextSplitter
|
97 |
|
98 |
def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
|