gourisankar85 committed on
Commit
ae1a8f1
·
verified ·
1 Parent(s): 1ac5ab9

Update retriever/embed_documents.py

Browse files
Files changed (1) hide show
  1. retriever/embed_documents.py +99 -98
retriever/embed_documents.py CHANGED
@@ -1,98 +1,99 @@
1
- '''import os
2
- import logging
3
- from langchain_huggingface import HuggingFaceEmbeddings
4
- from langchain_community.vectorstores import FAISS
5
-
6
- from config import ConfigConstants
7
-
8
- def embed_documents(documents, embedding_path="embeddings.faiss"):
9
- embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)
10
-
11
- if os.path.exists(embedding_path):
12
- logging.info("Loading embeddings from local file")
13
- vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
14
- else:
15
- logging.info("Generating and saving embeddings")
16
- vector_store = FAISS.from_texts([doc['text'] for doc in documents], embedding_model)
17
- vector_store.save_local(embedding_path)
18
-
19
- return vector_store'''
20
-
21
- import os
22
- import logging
23
- import hashlib
24
- from typing import List, Dict
25
- from concurrent.futures import ThreadPoolExecutor
26
- from tqdm import tqdm
27
- from langchain_community.vectorstores import FAISS
28
- from langchain_huggingface import HuggingFaceEmbeddings
29
- from config import ConfigConstants
30
-
31
-
32
def embed_documents(documents: List[Dict], embedding_path: str = ConfigConstants.DATA_SET_PATH + "embeddings/embeddings.faiss", metadata_path: str = ConfigConstants.DATA_SET_PATH + "embeddings/metadata.json") -> FAISS:
    """Build or incrementally update a FAISS vector store for ``documents``.

    Each document is identified by the SHA-256 hash of its ``'text'`` value.
    Hashes already recorded in the metadata file are skipped, so only texts
    not seen before are embedded and added to the store.

    Args:
        documents: Dicts that each provide the text to embed under ``'text'``.
        embedding_path: Location passed to ``FAISS.save_local`` /
            ``FAISS.load_local``.
            NOTE(review): LangChain documents this as a folder path, so it is
            expected to end up as a directory - confirm.
        metadata_path: JSON file mapping processed document hashes to ``True``.

    Returns:
        The loaded, created or updated FAISS vector store. When there is no
        saved store and no document to index, a store holding the single
        placeholder text ``"dummy document"`` is returned (and not saved).
    """
    logging.info(f"Total documents got :{len(documents)}")

    # Create only the *parent* directories. The targets themselves must not be
    # pre-created: that would make the cache check below always succeed and
    # would turn the metadata file into a directory.
    for target in (embedding_path, metadata_path):
        parent_dir = os.path.dirname(target)
        if parent_dir:  # os.makedirs("") raises for bare file names
            os.makedirs(parent_dir, exist_ok=True)

    embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)

    vector_store = None
    existing_metadata = {}
    if os.path.exists(embedding_path) and os.path.isfile(metadata_path):
        logging.info("Loading embeddings and metadata from local files")
        # SECURITY: allow_dangerous_deserialization=True opts in to loading
        # pickled data. Only point embedding_path at files this application
        # wrote itself, never at untrusted input.
        vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
        existing_metadata = _load_metadata(metadata_path)

    # Collect the texts whose hash has not been recorded yet. Marking the hash
    # immediately also de-duplicates repeated texts within this call.
    new_texts = []
    for doc in documents:
        doc_hash = _generate_document_hash(doc['text'])
        if doc_hash not in existing_metadata:
            new_texts.append(doc['text'])
            existing_metadata[doc_hash] = True  # Mark as processed

    if new_texts:
        logging.info(f"Generating embeddings for {len(new_texts)} new documents")
        # Embed in batches instead of one add_texts call per document. The
        # store is created from the first batch, so no document is indexed
        # twice and no placeholder entry is needed.
        batch_size = 64
        batches = [new_texts[start:start + batch_size] for start in range(0, len(new_texts), batch_size)]
        for batch in tqdm(batches, desc="Generating embeddings", unit="batch"):
            if vector_store is None:
                vector_store = FAISS.from_texts(batch, embedding_model)
            else:
                vector_store.add_texts(batch)

        # Save updated embeddings and metadata
        vector_store.save_local(embedding_path)
        _save_metadata(metadata_path, existing_metadata)
    else:
        logging.info("No new documents to process. Using existing embeddings.")
        if vector_store is None:
            # Nothing saved and nothing to index: keep returning a usable
            # store, seeded with a placeholder because FAISS.from_texts needs
            # at least one text.
            vector_store = FAISS.from_texts(["dummy document"], embedding_model)

    return vector_store
76
-
77
def _embed_single_document(doc: Dict, embedding_model: HuggingFaceEmbeddings) -> str:
    """Return the value stored under the document's ``'text'`` key.

    ``embedding_model`` is accepted but not used: no embedding is computed
    here, the text is simply handed back to the caller.
    """
    text = doc['text']
    return text
79
-
80
def _generate_document_hash(text: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``text``.

    The text is encoded with ``str.encode()``'s default encoding before
    hashing, so equal texts always map to the same digest.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    return hasher.hexdigest()
83
-
84
def _load_metadata(metadata_path: str) -> Dict[str, bool]:
    """Read the JSON metadata stored at ``metadata_path``.

    Returns the parsed JSON content, or an empty dict when nothing exists at
    that path.
    """
    import json

    # Guard clause: a missing file simply means nothing was processed yet.
    if not os.path.exists(metadata_path):
        return {}

    with open(metadata_path, "r") as handle:
        raw = handle.read()
    return json.loads(raw)
91
-
92
def _save_metadata(metadata_path: str, metadata: Dict[str, bool]):
    """Write ``metadata`` to ``metadata_path`` as JSON, replacing any previous content."""
    import json

    with open(metadata_path, "w") as handle:
        handle.write(json.dumps(metadata))
97
-
98
-
 
 
1
+ '''import os
2
+ import logging
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import FAISS
5
+
6
+ from config import ConfigConstants
7
+
8
+ def embed_documents(documents, embedding_path="embeddings.faiss"):
9
+ embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)
10
+
11
+ if os.path.exists(embedding_path):
12
+ logging.info("Loading embeddings from local file")
13
+ vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
14
+ else:
15
+ logging.info("Generating and saving embeddings")
16
+ vector_store = FAISS.from_texts([doc['text'] for doc in documents], embedding_model)
17
+ vector_store.save_local(embedding_path)
18
+
19
+ return vector_store'''
20
+
21
+ import os
22
+ import logging
23
+ import hashlib
24
+ from typing import List, Dict
25
+ from concurrent.futures import ThreadPoolExecutor
26
+ from tqdm import tqdm
27
+ from langchain_community.vectorstores import FAISS
28
+ from langchain_huggingface import HuggingFaceEmbeddings
29
+ from config import ConfigConstants
30
+
31
+
32
def embed_documents(documents: List[Dict], embedding_path: str = ConfigConstants.DATA_SET_PATH + "embeddings/embeddings.faiss", metadata_path: str = ConfigConstants.DATA_SET_PATH + "embeddings/metadata.json") -> FAISS:
    """Build or incrementally update a FAISS vector store for ``documents``.

    Each document is identified by the SHA-256 hash of its ``'text'`` value.
    Hashes already recorded in the metadata file are skipped, so only texts
    not seen before are embedded and added to the store.

    Args:
        documents: Dicts that each provide the text to embed under ``'text'``.
        embedding_path: Location passed to ``FAISS.save_local`` /
            ``FAISS.load_local``.
            NOTE(review): LangChain documents this as a folder path, so it is
            expected to end up as a directory - confirm.
        metadata_path: JSON file mapping processed document hashes to ``True``.

    Returns:
        The loaded, created or updated FAISS vector store. When there is no
        saved store and no document to index, a store holding the single
        placeholder text ``"dummy document"`` is returned (and not saved).
    """
    logging.info(f"Total documents got :{len(documents)}")

    # Create only the *parent* directories. Calling os.makedirs on
    # embedding_path / metadata_path themselves would make the cache check
    # below always succeed (attempting to load from an empty folder on the
    # first run) and would turn the metadata JSON file into a directory.
    for target in (embedding_path, metadata_path):
        parent_dir = os.path.dirname(target)
        if parent_dir:  # os.makedirs("") raises for bare file names
            os.makedirs(parent_dir, exist_ok=True)

    embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)

    vector_store = None
    existing_metadata = {}
    # isfile (not exists): a directory left behind at metadata_path must not
    # be mistaken for a saved metadata file.
    if os.path.exists(embedding_path) and os.path.isfile(metadata_path):
        logging.info("Loading embeddings and metadata from local files")
        # SECURITY: allow_dangerous_deserialization=True opts in to loading
        # pickled data. Only point embedding_path at files this application
        # wrote itself, never at untrusted input.
        vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
        existing_metadata = _load_metadata(metadata_path)

    # Collect the texts whose hash has not been recorded yet. Marking the hash
    # immediately also de-duplicates repeated texts within this call.
    new_texts = []
    for doc in documents:
        doc_hash = _generate_document_hash(doc['text'])
        if doc_hash not in existing_metadata:
            new_texts.append(doc['text'])
            existing_metadata[doc_hash] = True  # Mark as processed

    if new_texts:
        logging.info(f"Generating embeddings for {len(new_texts)} new documents")
        # Embed in batches instead of one add_texts call per document. The
        # store is created from the first batch, so no document is indexed
        # twice and no placeholder entry is needed.
        batch_size = 64
        batches = [new_texts[start:start + batch_size] for start in range(0, len(new_texts), batch_size)]
        for batch in tqdm(batches, desc="Generating embeddings", unit="batch"):
            if vector_store is None:
                vector_store = FAISS.from_texts(batch, embedding_model)
            else:
                vector_store.add_texts(batch)

        # Save updated embeddings and metadata
        vector_store.save_local(embedding_path)
        _save_metadata(metadata_path, existing_metadata)
    else:
        logging.info("No new documents to process. Using existing embeddings.")
        if vector_store is None:
            # Nothing saved and nothing to index: keep returning a usable
            # store, seeded with a placeholder because FAISS.from_texts needs
            # at least one text.
            vector_store = FAISS.from_texts(["dummy document"], embedding_model)

    return vector_store
77
+
78
def _embed_single_document(doc: Dict, embedding_model: HuggingFaceEmbeddings) -> str:
    """Return the value stored under the document's ``'text'`` key.

    ``embedding_model`` is accepted but not used: no embedding is computed
    here, the text is simply handed back to the caller.
    """
    text = doc['text']
    return text
80
+
81
def _generate_document_hash(text: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``text``.

    The text is encoded with ``str.encode()``'s default encoding before
    hashing, so equal texts always map to the same digest.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    return hasher.hexdigest()
84
+
85
def _load_metadata(metadata_path: str) -> Dict[str, bool]:
    """Read the JSON metadata stored at ``metadata_path``.

    Returns the parsed JSON content, or an empty dict when nothing exists at
    that path.
    """
    import json

    # Guard clause: a missing file simply means nothing was processed yet.
    if not os.path.exists(metadata_path):
        return {}

    with open(metadata_path, "r") as handle:
        raw = handle.read()
    return json.loads(raw)
92
+
93
def _save_metadata(metadata_path: str, metadata: Dict[str, bool]):
    """Write ``metadata`` to ``metadata_path`` as JSON, replacing any previous content."""
    import json

    with open(metadata_path, "w") as handle:
        handle.write(json.dumps(metadata))
98
+
99
+