codelion committed on
Commit
8056ec2
·
verified ·
1 Parent(s): 3dffd64

Update loaders/common.py

Browse files
Files changed (1) hide show
  1. loaders/common.py +20 -9
loaders/common.py CHANGED
@@ -15,32 +15,43 @@ def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
15
  if file_size > 1000000:
16
  st.error("File size is too large. Please upload a file smaller than 1MB or self host.")
17
  return
18
-
19
  dateshort = time.strftime("%Y%m%d")
20
  with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as tmp_file:
21
  tmp_file.write(file.getvalue())
22
  tmp_file.flush()
23
-
24
  loader = loader_class(tmp_file.name)
25
  documents = loader.load()
26
  file_sha1 = compute_sha1_from_file(tmp_file.name)
27
-
28
  os.remove(tmp_file.name)
29
 
30
  chunk_size = st.session_state['chunk_size']
31
  chunk_overlap = st.session_state['chunk_overlap']
32
-
33
  text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
34
 
35
  documents = text_splitter.split_documents(documents)
36
-
37
  # Add the document sha1 as metadata to each document
38
  docs_with_metadata = [Document(page_content=doc.page_content, metadata={"file_sha1": file_sha1,"file_size":file_size ,"file_name": file_name,
39
  "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort,
40
  "user" : st.session_state["username"]})
41
  for doc in documents]
42
 
43
- vector_store.add_documents(docs_with_metadata)
44
- if stats_db:
45
- add_usage(stats_db, "embedding", "file", metadata={"file_name": file_name,"file_type": file_suffix,
46
- "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  if file_size > 1000000:
16
  st.error("File size is too large. Please upload a file smaller than 1MB or self host.")
17
  return
18
+
19
  dateshort = time.strftime("%Y%m%d")
20
  with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as tmp_file:
21
  tmp_file.write(file.getvalue())
22
  tmp_file.flush()
 
23
  loader = loader_class(tmp_file.name)
24
  documents = loader.load()
25
  file_sha1 = compute_sha1_from_file(tmp_file.name)
 
26
  os.remove(tmp_file.name)
27
 
28
  chunk_size = st.session_state['chunk_size']
29
  chunk_overlap = st.session_state['chunk_overlap']
 
30
  text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
31
 
32
  documents = text_splitter.split_documents(documents)
 
33
  # Add the document sha1 as metadata to each document
34
  docs_with_metadata = [Document(page_content=doc.page_content, metadata={"file_sha1": file_sha1,"file_size":file_size ,"file_name": file_name,
35
  "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort,
36
  "user" : st.session_state["username"]})
37
  for doc in documents]
38
 
39
+ try:
40
+ vector_store.add_documents(docs_with_metadata)
41
+ if stats_db:
42
+ add_usage(stats_db, "embedding", "file", metadata={"file_name": file_name,"file_type": file_suffix,
43
+ "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
44
+ except Exception as e:
45
+ print(f"Error adding documents to vector store:")
46
+ print(f"Exception: {str(e)}")
47
+ print(f"Input details:")
48
+ print(f"File name: {file_name}")
49
+ print(f"File size: {file_size}")
50
+ print(f"File SHA1: {file_sha1}")
51
+ print(f"Number of documents: {len(docs_with_metadata)}")
52
+ print(f"Chunk size: {chunk_size}")
53
+ print(f"Chunk overlap: {chunk_overlap}")
54
+ print(f"First document preview (truncated):")
55
+ if docs_with_metadata:
56
+ print(docs_with_metadata[0].page_content[:500])
57
+ raise # Re-raise the exception after logging