Commit · bdef5c4
1 Parent(s): bbb7e28

Make working embedding view
Files changed:
- app.py        +1  -1
- resources.py  +21 -2
- views.py      +34 -5
app.py
CHANGED
@@ -44,4 +44,4 @@ with tab1:
     )
 
 with tab2:
-    views.diffs(embeddings, corrector)
+    views.diffs(embeddings, corrector, encoder, tokenizer)
resources.py
CHANGED
@@ -1,10 +1,11 @@
 import streamlit as st
 import pandas as pd
+import torch
 import vec2text
 from transformers import AutoModel, AutoTokenizer
 from sklearn.decomposition import PCA
 from utils import file_cache
-
+from transformers import PreTrainedModel, PreTrainedTokenizer
 
 # Caching the vec2text corrector
 @st.cache_resource
@@ -35,4 +36,22 @@ def reduce_embeddings(embeddings):
 def load_model_and_tokenizer(device="cpu"):
     encoder = AutoModel.from_pretrained("sentence-transformers/gtr-t5-base").encoder.to(device)
     tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/gtr-t5-base")
-    return encoder, tokenizer
+    return encoder, tokenizer
+
+
+def get_gtr_embeddings(text_list,
+                       encoder: PreTrainedModel,
+                       tokenizer: PreTrainedTokenizer) -> torch.Tensor:
+
+    inputs = tokenizer(text_list,
+                       return_tensors="pt",
+                       max_length=128,
+                       truncation=True,
+                       padding="max_length",).to("cuda")
+
+    with torch.no_grad():
+        model_output = encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
+        hidden_state = model_output.last_hidden_state
+        embeddings = vec2text.models.model_utils.mean_pool(hidden_state, inputs['attention_mask'])
+
+    return embeddings
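
Side note: the new get_gtr_embeddings helper follows the usual GTR recipe (tokenize, encode with the gtr-t5-base encoder, mean-pool over the attention mask), but it moves the tokenized inputs to "cuda" unconditionally while load_model_and_tokenizer defaults to device="cpu". A minimal usage sketch under that GPU assumption (not part of the commit):

    # Sketch only; assumes a CUDA device is available, since
    # get_gtr_embeddings hard-codes .to("cuda") for its inputs.
    from resources import load_model_and_tokenizer, get_gtr_embeddings

    encoder, tokenizer = load_model_and_tokenizer(device="cuda")
    embeddings = get_gtr_embeddings(["A man is to king, what woman is to queen"],
                                    encoder, tokenizer)
    print(embeddings.shape)  # (1, 768) for gtr-t5-base
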
views.py
CHANGED
@@ -5,16 +5,45 @@ from umap import UMAP
 import plotly.express as px
 import numpy as np
 from streamlit_plotly_events import plotly_events
-from resources import reduce_embeddings
 import utils
 import pandas as pd
 from scipy.spatial import distance
 
 dimensionality_reduction_model_name = "PCA"
 
-def diffs(embeddings: np.ndarray, corrector):
-    st.
-    st.
+def diffs(embeddings: np.ndarray, corrector, encoder, tokenizer):
+    st.title('"A man is to king, what woman is to queen"')
+    st.markdown("A well-known phenomenon in semantic vectors is the way we can do vector operations like addition and subtraction to find spatial relations in the vector space.")
+    st.markdown(
+        'In word embedding models, we have found that the relationship between words can be captured mathematically, '
+        'such that "king" is to "man" as "queen" is to "woman," demonstrating that vector arithmetic can encode analogies and semantic relationships in high-dimensional space ([Mikolov et al., 2013](https://arxiv.org/abs/1301.3781)).'
+    )
+    st.markdown("This application lets you freely explore to what extent that property applies to embedding inversion models, given the other sources of inaccuracy.")
+
+    generated_sentence = ""
+
+
+    with st.form(key="foo") as form:
+        submit_button = st.form_submit_button("Synthesize")
+
+        sent1 = st.text_input("Sentence 1")
+        st.latex("-")
+        sent2 = st.text_input("Sentence 2")
+        st.latex("+")
+        sent3 = st.text_input("Sentence 3")
+        st.latex("=")
+
+        if submit_button:
+            generated_sentence = "HI"
+
+        sent4 = st.text_input("Sentence 4", value=generated_sentence, disabled=True)
+
+
+
+    if submit_button:
+        generated_sentence = "HI!"
+
+    # st.html('<a href="https://www.flaticon.com/free-icons/array" title="array icons">Array icons created by Voysla - Flaticon</a>')
 
 def plot(df: pd.DataFrame, embeddings: np.ndarray, vectors_2d, reducer, corrector):
 
@@ -88,7 +117,7 @@ def plot(df: pd.DataFrame, embeddings: np.ndarray, vectors_2d, reducer, corrector):
 
     if inferred_embedding is not None and (closest_sentence_index != -1):
         couple = selected_sentence_embedding.squeeze(), inferred_embedding.squeeze()
-        st.markdown(
+        st.markdown("### Inferred embedding distance:")
         st.number_input("Euclidean", value=distance.euclidean(
             *couple
         ), disabled=True)
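
The Synthesize handler in this commit is still a placeholder (generated_sentence = "HI"), but the view's text describes the intended behavior: form "Sentence 1 - Sentence 2 + Sentence 3" in embedding space, then invert the result back to text. A hedged sketch of what that step could look like, using the encoder and tokenizer now passed into diffs() and vec2text's public invert_embeddings entry point (the num_steps value and the wiring are assumptions, not part of this commit):

    # Sketch only; not in this commit. Sentence analogy via embedding
    # arithmetic, then inversion back to a sentence with vec2text.
    import vec2text
    from resources import get_gtr_embeddings

    def synthesize(sent1, sent2, sent3, corrector, encoder, tokenizer):
        e1, e2, e3 = get_gtr_embeddings([sent1, sent2, sent3], encoder, tokenizer)
        target = (e1 - e2 + e3).unsqueeze(0)  # shape (1, 768)
        # num_steps trades inversion quality for latency (assumed value).
        return vec2text.invert_embeddings(embeddings=target,
                                          corrector=corrector,
                                          num_steps=20)[0]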
|