marksverdhei committed
Commit bdef5c4 · 1 Parent(s): bbb7e28

Make working embedding view

Files changed (3):
  1. app.py +1 -1
  2. resources.py +21 -2
  3. views.py +34 -5
app.py CHANGED
@@ -44,4 +44,4 @@ with tab1:
     )
 
 with tab2:
-    views.diffs(embeddings, corrector)
+    views.diffs(embeddings, corrector, encoder, tokenizer)
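
The view now needs the encoder and tokenizer in scope. The diff does not show where they come from; a minimal sketch of the assumed wiring earlier in app.py, using the loader defined in resources.py:

    # Hypothetical wiring, assumed to sit above the tabs in app.py (not part of this diff)
    from resources import load_model_and_tokenizer

    encoder, tokenizer = load_model_and_tokenizer(device="cpu")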
resources.py CHANGED
@@ -1,10 +1,11 @@
 import streamlit as st
 import pandas as pd
+import torch
 import vec2text
 from transformers import AutoModel, AutoTokenizer
 from sklearn.decomposition import PCA
 from utils import file_cache
-
+from transformers import PreTrainedModel, PreTrainedTokenizer
 
 # Caching the vec2text corrector
 @st.cache_resource
@@ -35,4 +36,22 @@ def reduce_embeddings(embeddings):
 def load_model_and_tokenizer(device="cpu"):
     encoder = AutoModel.from_pretrained("sentence-transformers/gtr-t5-base").encoder.to(device)
     tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/gtr-t5-base")
-    return encoder, tokenizer
+    return encoder, tokenizer
+
+
+def get_gtr_embeddings(text_list,
+                       encoder: PreTrainedModel,
+                       tokenizer: PreTrainedTokenizer) -> torch.Tensor:
+    # Tokenize and move inputs to the same device as the encoder
+    inputs = tokenizer(text_list,
+                       return_tensors="pt",
+                       max_length=128,
+                       truncation=True,
+                       padding="max_length").to(encoder.device)
+
+    with torch.no_grad():
+        model_output = encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
+        hidden_state = model_output.last_hidden_state
+        embeddings = vec2text.models.model_utils.mean_pool(hidden_state, inputs['attention_mask'])
+
+    return embeddings
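
The new helper mirrors the embedding recipe from the vec2text README: encode with GTR-T5, then mean-pool the last hidden states with the attention mask. As a quick sanity check, a minimal sketch of a round trip, embed then invert back to text, assuming the corrector is vec2text's pretrained "gtr-base" corrector as in the README (the sentence and num_steps are illustrative, and CPU inference will be slow):

    import vec2text
    from resources import load_model_and_tokenizer, get_gtr_embeddings

    encoder, tokenizer = load_model_and_tokenizer(device="cpu")
    corrector = vec2text.load_pretrained_corrector("gtr-base")

    # Embed a sentence, then reconstruct (approximate) text from the vector
    embeddings = get_gtr_embeddings(["A man is to king what a woman is to queen."],
                                    encoder, tokenizer)
    recovered = vec2text.invert_embeddings(embeddings=embeddings,
                                           corrector=corrector,
                                           num_steps=20)
    print(recovered)  # a list with one approximately reconstructed sentence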
views.py CHANGED
@@ -5,16 +5,45 @@ from umap import UMAP
 import plotly.express as px
 import numpy as np
 from streamlit_plotly_events import plotly_events
-from resources import reduce_embeddings
 import utils
 import pandas as pd
 from scipy.spatial import distance
 
 dimensionality_reduction_model_name = "PCA"
 
-def diffs(embeddings: np.ndarray, corrector):
-    st.text(f"Embedding shape: {embeddings.shape}")
-    st.html('<a href="https://www.flaticon.com/free-icons/array" title="array icons">Array icons created by Voysla - Flaticon</a>')
+def diffs(embeddings: np.ndarray, corrector, encoder, tokenizer):
+    st.title('"A man is to king, what woman is to queen"')
+    st.markdown("A well-known phenomenon in semantic vectors is the way we can do vector operations like addition and subtraction to find spatial relations in the vector space.")
+    st.markdown(
+        'In word embedding models, we have found that the relationship between words can be captured mathematically, '
+        'such that "king" is to "man" as "queen" is to "woman," demonstrating that vector arithmetic can encode analogies and semantic relationships in high-dimensional space ([Mikolov et al., 2013](https://arxiv.org/abs/1301.3781)).'
+    )
+    st.markdown("This application lets you freely explore to what extent that property applies to embedding inversion models, given the other sources of inaccuracy.")
+
+    generated_sentence = ""
+
+
+    with st.form(key="foo") as form:
+        submit_button = st.form_submit_button("Synthesize")
+
+        sent1 = st.text_input("Sentence 1")
+        st.latex("-")
+        sent2 = st.text_input("Sentence 2")
+        st.latex("+")
+        sent3 = st.text_input("Sentence 3")
+        st.latex("=")
+
+        if submit_button:
+            generated_sentence = "HI"  # placeholder output for now
+
+        sent4 = st.text_input("Sentence 4", value=generated_sentence, disabled=True)
+
+
+
+        if submit_button:
+            generated_sentence = "HI!"  # no effect: "Sentence 4" is already rendered above
+
+    # st.html('<a href="https://www.flaticon.com/free-icons/array" title="array icons">Array icons created by Voysla - Flaticon</a>')
 
 def plot(df: pd.DataFrame, embeddings: np.ndarray, vectors_2d, reducer, corrector):
 
@@ -88,7 +117,7 @@ def plot(df: pd.DataFrame, embeddings: np.ndarray, vectors_2d, reducer, corrector):
 
     if inferred_embedding is not None and (closest_sentence_index != -1):
         couple = selected_sentence_embedding.squeeze(), inferred_embedding.squeeze()
-        st.markdown(f"### Inferred embedding distance:")
+        st.markdown("### Inferred embedding distance:")
         st.number_input("Euclidean", value=distance.euclidean(
             *couple
         ), disabled=True)
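
The Synthesize handler still emits the "HI" placeholder. A minimal sketch of what it could compute instead: sentence-level analogy arithmetic followed by inversion, in the spirit of king - man + woman ≈ queen. This is not part of the commit; get_gtr_embeddings is the helper from resources.py, invert_embeddings follows the vec2text README, and num_steps is illustrative:

    import vec2text
    from resources import get_gtr_embeddings

    # Hypothetical replacement for the "HI" placeholder inside diffs()
    if submit_button and sent1 and sent2 and sent3:
        # One (768-dim) GTR embedding per input sentence
        emb1, emb2, emb3 = get_gtr_embeddings([sent1, sent2, sent3], encoder, tokenizer)
        # Sentence 1 - Sentence 2 + Sentence 3, as a batch of one
        target = (emb1 - emb2 + emb3).unsqueeze(0)
        # Invert the synthesized vector back into text
        generated_sentence = vec2text.invert_embeddings(
            embeddings=target, corrector=corrector, num_steps=20
        )[0]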