John Graham Reynolds committed on
Commit 8bb66b9 · 1 Parent(s): 7f097f8

update app to use Langchain for retrieval

Files changed (1):
  1. app.py  +31 -15
app.py CHANGED
@@ -1,7 +1,7 @@
-import streamlit as st
 import os
-from mlflow import deployments
-from databricks.vector_search.client import VectorSearchClient
+import streamlit as st
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_databricks.vectorstores import DatabricksVectorSearch
 
 DATABRICKS_HOST = os.environ.get("DATABRICKS_HOST")
 DATABRICKS_API_TOKEN = os.environ.get("DATABRICKS_API_TOKEN")
@@ -39,24 +39,40 @@ st.markdown("\n")
 # with open("style.css") as css:
 # st.markdown( f'<style>{css.read()}</style>' , unsafe_allow_html= True)
 
+# Same embedding model we used to create embeddings of terms
+# make sure we cache this so that it doesnt redownload each time, hindering Space start time if sleeping
+embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en", cache_folder="./langchain_cache/")
+
+vector_store = DatabricksVectorSearch(
+    endpoint=VS_ENDPOINT_NAME,
+    index_name=VS_INDEX_NAME,
+    embedding=embeddings,
+    text_column="name",
+    columns=["name", "description"],
+)
+
+results = vector_store.similarity_search(query="Tell me about what a data lake is.", k=5)
+st.write(results)
+
+
 
 # TODO *** configure to run only on prompt for verification?
-vsc = VectorSearchClient()
+# vsc = VectorSearchClient()
 
-question = "What is the data lake?"
-# question_2 = "What does EDW stand for?"
-# question_3 = "What does AIDET stand for?"
+# question = "What is the data lake?"
+# # question_2 = "What does EDW stand for?"
+# # question_3 = "What does AIDET stand for?"
 
-deploy_client = deployments.get_deploy_client("databricks")
-response = deploy_client.predict(endpoint="databricks-bge-large-en", inputs={"input": [question]})
-embeddings = [e['embedding'] for e in response.data]
+# deploy_client = deployments.get_deploy_client("databricks")
+# response = deploy_client.predict(endpoint="databricks-bge-large-en", inputs={"input": [question]})
+# embeddings = [e['embedding'] for e in response.data]
 
-results = vsc.get_index(VS_ENDPOINT_NAME, VS_INDEX_NAME).similarity_search(
-    query_vector=embeddings[0],
-    columns=["name", "description"],
-    num_results=5)
+# results = vsc.get_index(VS_ENDPOINT_NAME, VS_INDEX_NAME).similarity_search(
+#     query_vector=embeddings[0],
+#     columns=["name", "description"],
+#     num_results=5)
 
-st.write(results)
+# st.write(results)
 
 
 # print(results)
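
For reference, a minimal standalone sketch of the retrieval path this commit introduces, outside the Streamlit app. The VS_ENDPOINT_NAME / VS_INDEX_NAME values below are hypothetical placeholders (app.py defines its own elsewhere), and Databricks credentials (e.g. DATABRICKS_HOST and a token) are assumed to be configured in the environment.

# Standalone sketch of the LangChain-based retrieval from this commit; placeholder names marked below.
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_databricks.vectorstores import DatabricksVectorSearch

# Hypothetical endpoint/index names for illustration only.
VS_ENDPOINT_NAME = os.environ.get("VS_ENDPOINT_NAME", "vs_endpoint")
VS_INDEX_NAME = os.environ.get("VS_INDEX_NAME", "catalog.schema.vs_index")

# Query-time embeddings must come from the same model used to embed the indexed terms.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en", cache_folder="./langchain_cache/")

# Wrap the existing Databricks Vector Search index: text_column is the embedded column,
# columns are the fields returned with each hit.
vector_store = DatabricksVectorSearch(
    endpoint=VS_ENDPOINT_NAME,
    index_name=VS_INDEX_NAME,
    embedding=embeddings,
    text_column="name",
    columns=["name", "description"],
)

# Top-5 nearest neighbors for a natural-language query; returns LangChain Document objects.
for doc in vector_store.similarity_search(query="What is the data lake?", k=5):
    print(doc.page_content, doc.metadata)

Compared with the commented-out approach (embedding the query via the databricks-bge-large-en serving endpoint and calling VectorSearchClient directly), the vector store embeds queries locally with the cached BGE model and returns Document objects that can plug into downstream LangChain retrieval chains.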