Commit 92b778c (parent: a6a7ec1), committed by im

add vector database 3d space visualisation
app.py
CHANGED
@@ -548,6 +548,7 @@ st.plotly_chart(fig, use_container_width=True)
 with st.expander("Python Code:"):
     st.code(f"""\
 import openai
+import numpy as np

 EMBEDDING_MODEL = 'text-embedding-ada-002'

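Note: the hunks below call a get_embeddings helper that is defined elsewhere in app.py and is not part of this diff. A minimal sketch of what such a helper might look like, assuming the legacy openai-python (pre-1.0) Embedding API and the EMBEDDING_MODEL constant shown above; the function body is an assumption, only the name and the model come from the diff.

import openai

EMBEDDING_MODEL = 'text-embedding-ada-002'

def get_embeddings(text):
    # Assumed reconstruction, not taken from app.py: embed a single string and
    # return its 1536-dimensional vector as a plain list of floats.
    response = openai.Embedding.create(input=[text], model=EMBEDDING_MODEL)
    return response["data"][0]["embedding"]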
@@ -582,7 +583,84 @@ fig.update_layout(coloraxis_showscale=False)
 fig.update_layout(width=6000)
 st.plotly_chart(fig, use_container_width=True)

-st.subheader("
+st.subheader("Vector Databases")
+st.write("""\
+In a vector database, each item (e.g., a document) is represented as a point in a multidimensional
+space. Each point is a vector that represents the features of the item. The goal is to place similar items close to
+each other and dissimilar items farther apart. In the case of documents, the features could be derived from the words
+in the document, and the similarity might be based on the overlapping words or concepts between the documents.
+
+The retrieval of documents based on search terms involves two main steps:
+
+Vectorization of the search query: The search query is converted into a vector using the same process used to vectorize the documents in the database.
+
+Vector similarity search: The vector database then identifies the vectors that are closest to the query vector.
+This is typically done using a distance metric like Euclidean distance or cosine similarity. The documents
+corresponding to these vectors are returned as the search results.
+
+As you correctly assumed, we leverage embedding algorithms to vectorise documents. Let's generate a 3D
+visualization of the document vectors and a search query. For simplicity, let's assume we have a vector database
+of documents that has been reduced to 3 dimensions, and we'll also have a 3D vector for a search query.
+
+""")
+with st.expander("The Euclidean distance between two points in 3D space is calculated as:"):
+    st.latex("""\\text{Distance}(A(x_1, y_1, z_1), B(x_2, y_2, z_2)) = \sqrt{(x_2 - x_1)^2 + (y_2 - y_1)^2 + (z_2 - z_1)^2}""")
+st.write("""\
+The document that corresponds to the vector with the smallest distance to the query vector is
+considered the most relevant document. The 3D plot above now shows lines from the query vector (in red) to each
+document vector (in blue). Each line represents the Euclidean distance from the query vector to a document vector.
+""")
+embeddings = st.text_input("vector space:", value="king queen prince princess counselor minister teacher")
+embeddings = embeddings.split()
+embeddings_query = st.text_input(label="search term", value='woman')
+
+import numpy as np
+import plotly.express as px
+import plotly.graph_objects as go
+from sklearn.manifold import TSNE
+
+embeddings = {word: get_embeddings(word) for word in embeddings}
+embeddings[embeddings_query] = get_embeddings(embeddings_query)
+
+tsne = TSNE(n_components=3, perplexity=3, random_state=0)
+embedding_matrix = np.array(list(embeddings.values()))
+reduced_embeddings = tsne.fit_transform(embedding_matrix)
+
+df = pd.DataFrame(reduced_embeddings, columns=["X", "Y", "Z"])
+df["Word"] = list(embeddings.keys())
+fig = px.scatter_3d(df, x="X", y="Y", z="Z", text="Word", title="Vector Space", width=800, height=800)
+
+
+docs = reduced_embeddings[:-1]
+query = reduced_embeddings[-1]
+distances = np.linalg.norm(docs - query, axis=1)
+closest_doc_index = np.argmin(distances)
+closest_doc = docs[closest_doc_index]
+
+for doc in docs:
+    fig.add_trace(go.Scatter3d(x=[query[0], doc[0]], y=[query[1], doc[1]], z=[query[2], doc[2]], mode='lines', line=dict(color='purple', width=2, dash='dash')))
+fig.add_trace(go.Scatter3d(x=[query[0], closest_doc[0]], y=[query[1], closest_doc[1]], z=[query[2], closest_doc[2]], name='closest', mode='lines', line=dict(color='purple', width=2)))
+
+st.plotly_chart(fig, use_container_width=True)
+
+st.write("""\
+This visualization represents the core concept of a vector database search. The database converts the
+search query into a vector, then finds the document vectors that are closest to the query vector. Those documents are
+considered the most relevant to the search query.
+
+It's important to note that in a real-world application, the vectors would likely exist in much higher dimensional
+space. However, the same principles apply: the search algorithm finds the document vectors that are nearest to the
+query vector based on some distance metric.
+""")
+st.subheader(":green[Try Yourself]")
+
+st.write("""\
+*There is a vector database containing two words (documents): 'king' and 'queen'. Your task is to pinpoint search
+terms that would yield either of these words. To facilitate this, use the previously presented similarity matrix to
+seek out words that give a higher correlation with the word in question. For instance, you might want to explore
+terms such as 'king', 'queen', 'dog', 'prince', 'man', 'minister', 'boy'.*
+""")
+embeddings_query = st.text_input(label="search term")

 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import FAISS
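Note: the distance logic in the hunk above boils down to np.linalg.norm(docs - query, axis=1) followed by np.argmin. A self-contained toy version of the same two-step retrieval, using made-up 3D vectors rather than real embeddings, and also showing the cosine similarity variant mentioned in the text:

import numpy as np

# Hypothetical 3D "document" vectors and labels (illustrative values only).
docs = np.array([
    [0.9, 0.1, 0.2],   # "king"
    [0.8, 0.3, 0.1],   # "queen"
    [0.1, 0.9, 0.7],   # "teacher"
])
labels = ["king", "queen", "teacher"]
query = np.array([0.85, 0.25, 0.15])  # stand-in for the embedded search term

# Step 1: the query is vectorised the same way as the documents (assumed already done).
# Step 2: rank documents by a distance metric and pick the closest one.
euclidean = np.linalg.norm(docs - query, axis=1)  # smaller = closer
cosine = (docs @ query) / (np.linalg.norm(docs, axis=1) * np.linalg.norm(query))  # larger = closer

print(labels[np.argmin(euclidean)])  # nearest document by Euclidean distance
print(labels[np.argmax(cosine)])     # most similar document by cosine similarity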
@@ -599,13 +677,6 @@ def search_vector_database(term):
     docs = db.similarity_search_by_vector(embedding_vector)
     return docs

-st.write("""\
-*There is a vector database containing two words: 'king' and 'queen'. Your task is to pinpoint search
-terms that would yield either of these words. To facilitate this, use the previously presented similarity matrix to
-seek out words that give a higher correlation with the word in question. For instance, you might want to explore
-terms such as 'king', 'queen', 'dog', 'prince', 'man', 'minister', 'boy'.*
-""")
-embeddings_query = st.text_input(label="search term")
 if embeddings_query is not None and embeddings_query != '':
     docs = search_vector_database(embeddings_query)
     st.warning(docs[0].page_content)
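Note: the hunk above shows only the tail of search_vector_database; the construction of db and embedding_vector happens outside this diff. A plausible sketch of the surrounding code, assuming the langchain 0.0.x API that matches the imports shown (OpenAIEmbeddings, FAISS) and the two documents named in the exercise; everything except the function name and the similarity_search_by_vector call is an assumption:

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Assumed setup (not in the diff): a tiny FAISS index over the two example documents.
embedder = OpenAIEmbeddings()
db = FAISS.from_texts(["king", "queen"], embedder)

def search_vector_database(term):
    # Embed the search term with the same embedding model used for the documents,
    # then return the documents whose vectors are closest to the query vector.
    embedding_vector = embedder.embed_query(term)
    docs = db.similarity_search_by_vector(embedding_vector)
    return docs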
@@ -623,7 +694,7 @@ if embeddings_query is not None and embeddings_query != '':
 """)

 divider()
-st.
+st.subheader("Conclusion")
 st.write("""\
 As embedding algorithms are trained on a vast corpus of data, they inherently encapsulate a rich
 tapestry of information about our language and even the world at large. Therefore, they can be used for:
@@ -643,10 +714,11 @@ with st.expander("References:"):
 - https://platform.openai.com/docs/guides/embeddings/use-cases
 """)

+
+# *********************************************
 divider()
 st.header("Dimensionality Reduction (optional)")

-
 st.write("""\
 As was mentioned above, embedding vectors are learned in such a way that words with similar meanings
 are located close to each other in the space. However, this is an abstract concept that might be difficult to
@@ -728,7 +800,7 @@ elif dimensionality_name == 'PCA':
 """)
 embedding_dim = 1536
 embeddings = st.text_input("words to explore:",
-                           value="king queen man woman prince
+                           value="king queen man woman prince princess counselor minister teacher")
 embeddings = embeddings.split()
 embeddings = {word: get_embeddings(word) for word in embeddings}

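Note: the hunk above belongs to the elif dimensionality_name == 'PCA': branch named in its header, but the PCA reduction itself sits outside the diff. A minimal sketch of how the 1536-dimensional embeddings for the new default word list could be projected to 2D with scikit-learn's PCA; the plotting details are assumptions, only the word list and the get_embeddings helper come from the diff:

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA

words = "king queen man woman prince princess counselor minister teacher".split()
embeddings = {word: get_embeddings(word) for word in words}  # app.py helper, 1536-d vectors

# Project the high-dimensional vectors onto their two main directions of variance.
pca = PCA(n_components=2)
reduced = pca.fit_transform(np.array(list(embeddings.values())))

df = pd.DataFrame(reduced, columns=["X", "Y"])
df["Word"] = words
fig = px.scatter(df, x="X", y="Y", text="Word", title="PCA projection of word embeddings")
fig.show()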
@@ -787,7 +859,7 @@ elif dimensionality_name == 't-SNE':
 """)
 embedding_dim = 1536
 embeddings = st.text_input("words to explore:",
-                           value="king queen man woman prince
+                           value="king queen man woman prince princess counselor minister teacher")
 embeddings = embeddings.split()
 embeddings = {word: get_embeddings(word) for word in embeddings}
