Update functions.py

functions.py  CHANGED  (+25 -33)
@@ -69,25 +69,22 @@ async def handle_userinput(user_question, custom_graph):
 
 
 
-def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type='mmr', k=7, chunk_size=300, chunk_overlap=30,lambda_mult=
-
+def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type='mmr', k=7, chunk_size=300, chunk_overlap=30, lambda_mult=0.7):
     model_name = "Alibaba-NLP/gte-large-en-v1.5"
-    model_kwargs = {'device': 'cpu',
-                    "trust_remote_code" : 'False'}
+    model_kwargs = {'device': 'cpu', "trust_remote_code": 'False'}
     encode_kwargs = {'normalize_embeddings': True}
+
     embeddings = HuggingFaceEmbeddings(
         model_name=model_name,
         model_kwargs=model_kwargs,
         encode_kwargs=encode_kwargs
     )
 
-
-
     if os.path.exists(vectorstore_path) and os.listdir(vectorstore_path):
-        vectorstore = Chroma(persist_directory=vectorstore_path,embedding_function=embeddings)
-
+        vectorstore = Chroma(persist_directory=vectorstore_path, embedding_function=embeddings)
     else:
-        st.write("Vector store
+        st.write("Vector store doesn't exist and will be created now")
+
         urls = [
 
            "https://github.com/zedr/clean-code-python",
@@ -190,38 +187,33 @@ def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type=
            "https://datasciencedojo.com/blog/ensemble-methods-in-machine-learning/",
            "https://datasciencedojo.com/blog/langgraph-tutorial/",
            "https://datasciencedojo.com/blog/data-driven-marketing-in-2024/",
-           "https://datasciencedojo.com/blog/on-device-ai/"
-
+           "https://datasciencedojo.com/blog/on-device-ai/",
+
 
-
-
-
-
-
-
+        ]
+
+        def extract_sentences_from_web(links, chunk_size=500, chunk_overlap=30):
+            data = []
+            for link in links:
+                loader = NewsURLLoader(urls=[link])
+                data += loader.load()
+            return data
 
+        docs = extract_sentences_from_web(links=urls)
 
-
-
-
-
-
-
-            is_separator_regex = True
-        )
-        split_docs = text_splitter.split_documents(docs)
+        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+            chunk_size=chunk_size, chunk_overlap=chunk_overlap,
+            separators=["\n\n \n\n", "\n\n\n", "\n\n", r"In \[[0-9]+\]", r"\n+", r"\s+"],
+            is_separator_regex=True
+        )
+        split_docs = text_splitter.split_documents(docs)
 
-
-        vectorstore = Chroma.from_documents(
+        vectorstore = Chroma.from_documents(
            documents=split_docs, embedding=embeddings, persist_directory=vectorstore_path
         )
-
-
-    retriever=vectorstore.as_retriever(search_type = search_type, search_kwargs={"k": k})
 
+    retriever = vectorstore.as_retriever(search_type=search_type, search_kwargs={"k": k})
 
-
-
     return retriever
 
 
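For context, a minimal usage sketch of the updated `create_retriever_from_chroma`, roughly as the Space's Streamlit app might call it. The `functions` import path and the sample query are assumptions for illustration; the function's own dependencies (HuggingFaceEmbeddings, Chroma, NewsURLLoader, RecursiveCharacterTextSplitter, streamlit, os) are presumably imported at the top of functions.py.

```python
# Hypothetical usage sketch -- assumes functions.py is importable from the app
# and that its top-level imports (HuggingFaceEmbeddings, Chroma, NewsURLLoader,
# RecursiveCharacterTextSplitter, streamlit as st, os) are already in place.
from functions import create_retriever_from_chroma

# Loads ./docs/chroma/ if a persisted store exists; otherwise scrapes the URL
# list, splits the pages, and persists a new Chroma store before returning
# an MMR retriever.
retriever = create_retriever_from_chroma(
    vectorstore_path="./docs/chroma/",
    search_type="mmr",
    k=7,
)

# Retrievers are Runnables in current LangChain, so invoke() returns the
# top-k Documents for a query.
docs = retriever.invoke("What are ensemble methods in machine learning?")
for doc in docs:
    print(doc.metadata.get("source"), doc.page_content[:100])
```

One note on the new `model_kwargs`: `"trust_remote_code"` is set to the string `'False'`, which is truthy in Python, and gte-large-en-v1.5 ships custom modeling code that needs remote code enabled to load, so a plain boolean would state the intent more clearly.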
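The new signature also accepts `lambda_mult=0.7`, but `as_retriever` is still called with only `{"k": k}`, so the MMR diversity setting never reaches the search. If that wiring is intended, LangChain's `as_retriever` takes `lambda_mult` (and `fetch_k`) through `search_kwargs`; a hedged sketch, not part of this commit:

```python
# Sketch only: forwarding lambda_mult to the MMR search. "k", "fetch_k" and
# "lambda_mult" are the standard search_kwargs read by LangChain's MMR search.
retriever = vectorstore.as_retriever(
    search_type=search_type,  # 'mmr' by default in this function
    search_kwargs={"k": k, "fetch_k": max(20, 3 * k), "lambda_mult": lambda_mult},
)
```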