Sk4467 committed on
Commit
8967645
·
verified ·
1 Parent(s): d076fbf

Update file_processing.py

Browse files
Files changed (1) hide show
  1. file_processing.py +7 -6
file_processing.py CHANGED
@@ -60,16 +60,18 @@ def read_txt(file_path: str) -> str:
60
  with open(file_path, 'r', encoding='utf-8') as file:
61
  return file.read()
62
 
63
- async def load_documents(file: UploadFile)->List[Document]:
64
  temp_file_path = f"temp_{file.filename}"
65
  try:
66
  # Save the uploaded file to a temporary file
67
  with open(temp_file_path, "wb") as temp_file:
68
- temp_file.write(await file.read())
 
69
 
 
70
  content = ""
71
  if file.filename.endswith('.pdf'):
72
- content = read_pdf(temp_file_path)
73
  elif file.filename.endswith('.docx'):
74
  content = read_docx(temp_file_path)
75
  elif file.filename.endswith('.csv'):
@@ -79,19 +81,18 @@ async def load_documents(file: UploadFile)->List[Document]:
79
  else:
80
  raise ValueError("Unsupported file format")
81
  except Exception as e:
82
- # Handle general errors - log or adjust as necessary for your application
83
  print(f"Error processing document: {e}")
84
  content = "Error processing document."
85
  finally:
86
- # Cleanup: remove the temporary file
87
  if os.path.exists(temp_file_path):
88
- os.remove(temp_file_path)
89
 
90
  metadata = {'source': file.filename}
91
  document = Document(page_content=content, metadata=metadata)
92
  return [document]
93
 
94
 
 
95
  from langchain.text_splitter import CharacterTextSplitter
96
 
97
  def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
 
60
  with open(file_path, 'r', encoding='utf-8') as file:
61
  return file.read()
62
 
63
+ async def load_documents(file: UploadFile) -> List[Document]:
64
  temp_file_path = f"temp_{file.filename}"
65
  try:
66
  # Save the uploaded file to a temporary file
67
  with open(temp_file_path, "wb") as temp_file:
68
+ contents = await file.read() # Read the content of the uploaded file
69
+ temp_file.write(contents) # Write the content to the temporary file
70
 
71
+ # Now you can pass temp_file_path to your read functions
72
  content = ""
73
  if file.filename.endswith('.pdf'):
74
+ content = read_pdf(temp_file_path) # Pass the path, not the file object
75
  elif file.filename.endswith('.docx'):
76
  content = read_docx(temp_file_path)
77
  elif file.filename.endswith('.csv'):
 
81
  else:
82
  raise ValueError("Unsupported file format")
83
  except Exception as e:
 
84
  print(f"Error processing document: {e}")
85
  content = "Error processing document."
86
  finally:
 
87
  if os.path.exists(temp_file_path):
88
+ os.remove(temp_file_path) # Clean up the temporary file
89
 
90
  metadata = {'source': file.filename}
91
  document = Document(page_content=content, metadata=metadata)
92
  return [document]
93
 
94
 
95
+
96
  from langchain.text_splitter import CharacterTextSplitter
97
 
98
  def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):