gaodrew committed
Commit 0ee478c · 1 Parent(s): dada911

Upload comparevec2vecwithada.py

Files changed (1)
  1. comparevec2vecwithada.py +82 -0
comparevec2vecwithada.py ADDED
# -*- coding: utf-8 -*-
"""compareVec2VecWithAda.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1jPaNXdO0_oW6VczlWfm5RPUVpMtVQD9c
"""

import pandas as pd
import numpy as np
import openai
import tensorflow as tf  # required by cosine_similarity_loss below
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import load_model
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Load the MPNet sentence encoder (available from Hugging Face under the
# sentence-transformers organization)
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

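For reference, the same encoder can be loaded through the sentence-transformers package, which bundles the tokenization, mean pooling, and normalization steps this script performs by hand; a minimal sketch, assuming that package is installed:

from sentence_transformers import SentenceTransformer

# Sketch only: encode() returns L2-normalized 768-d vectors when
# normalize_embeddings=True, matching the manual pipeline below.
st_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
vec = st_model.encode(['example query'], normalize_embeddings=True)  # shape (1, 768)
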
# Cosine similarity loss the converter model was saved with; load_model
# needs it passed back in as a custom object to deserialize the .h5 file
def cosine_similarity_loss(y_true, y_pred):
    y_true = tf.nn.l2_normalize(y_true, axis=-1)
    y_pred = tf.nn.l2_normalize(y_pred, axis=-1)
    return -tf.reduce_mean(y_true * y_pred, axis=-1)

# Mean pooling: average the token embeddings, taking the attention mask into
# account so padding tokens don't contribute to the sentence embedding
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

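A quick sanity check of the pooling with toy tensors (hypothetical values, not part of the original script): the masked position must drop out of the average.

# Toy check: 1 sentence, 3 tokens, 2 dims; the last token is padding (mask 0)
toy_tokens = torch.tensor([[[1.0, 2.0], [3.0, 4.0], [9.0, 9.0]]])
toy_mask = torch.tensor([[1, 1, 0]])
print(mean_pooling((toy_tokens,), toy_mask))  # tensor([[2., 3.]]) -- mean of the two unmasked tokens
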
# Load the trained mpnet2ada converter network
loaded_model = load_model('mpnet2adaE75V4.h5', custom_objects={'cosine_similarity_loss': cosine_similarity_loss})

openai.api_key = "insert API key here"

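Since the converter is only used for inference here, an alternative sketch is to skip restoring the training configuration altogether, which makes the custom-object plumbing unnecessary:

# Alternative (inference only): compile=False skips the loss entirely
loaded_model = load_model('mpnet2adaE75V4.h5', compile=False)
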
# Load a CSV of 10,000 embeddings in our test set, paired with the original reviews
df2 = pd.read_csv('Actual_Embeddings.csv')

# Convert strings of lists to numpy arrays; this takes a while
df2['Actual_Embeddings'] = df2['Actual_Embeddings'].apply(eval).apply(np.array)

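If each cell is a plain bracketed list of floats (an assumption about this CSV), json.loads is a sketch of a faster and safer parse than eval, since it never executes arbitrary code:

import json

# Assumes cells look like "[0.01, -0.02, ...]"
df2['Actual_Embeddings'] = df2['Actual_Embeddings'].apply(json.loads).apply(np.array)
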
def get_top_5_texts(query):
    # Embed the query with MPNet, then map it into Ada's embedding space
    encoded_input = tokenizer(query, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        model_output = model(**encoded_input)

    mpnetEmbeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    mpnetEmbeddings = F.normalize(mpnetEmbeddings, p=2, dim=1)
    mpnetEmbeddings = mpnetEmbeddings.detach().cpu().numpy()
    mpnetEmbeddings = np.reshape(mpnetEmbeddings, (1, -1))
    query_embedding = loaded_model.predict(mpnetEmbeddings)

    # Rank the stored Ada embeddings against the converted query embedding
    similarities = [cosine_similarity(query_embedding.reshape(1, -1), emb.reshape(1, -1))[0][0]
                    for emb in df2['Actual_Embeddings']]

    print("Converted MPNet Embedding Results:")
    top_5_idx = np.argsort(similarities)[-5:][::-1]
    for i, idx in enumerate(top_5_idx, 1):
        print(f'Text {i}')
        print(df2['combined'].iloc[idx])
        print("\n")

    # Embed the same query directly with Ada and rank again for comparison
    response = openai.Embedding.create(input=query, model="text-embedding-ada-002")
    query_embedding = np.array(response['data'][0]['embedding'])
    similarities2 = [cosine_similarity(query_embedding.reshape(1, -1), emb.reshape(1, -1))[0][0]
                     for emb in df2['Actual_Embeddings']]

    print("OpenAI Embedding Results:")
    top_5_idx2 = np.argsort(similarities2)[-5:][::-1]
    for i, idx in enumerate(top_5_idx2, 1):
        print(f'Text {i}')
        print(df2['combined'].iloc[idx])
        print("\n")

while True:
    query = input("Enter your query: ")
    get_top_5_texts(query)
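
As a design note (a sketch, not part of the commit): stacking the stored embeddings into one matrix at startup lets each query score all 10,000 rows in a single vectorized call instead of a Python-level loop.

# Hypothetical speedup: build the matrix once; ada-002 vectors are 1536-d,
# so this is a (10000, 1536) array
emb_matrix = np.vstack(df2['Actual_Embeddings'].to_numpy())

def rank_top_5(query_embedding):
    # One cosine_similarity call over the whole matrix replaces the per-row loop
    sims = cosine_similarity(query_embedding.reshape(1, -1), emb_matrix)[0]
    return np.argsort(sims)[-5:][::-1]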