EmicoBinsfinder committed on
Commit
fb737cc
·
1 Parent(s): 04fbf3e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -0
app.py CHANGED
@@ -35,10 +35,12 @@ model = LlamaForCausalLM.from_pretrained(
35
  device_map="auto") #low_cpu_mem_usage=True)
36
 
37
  ########## DEFINING FUNCTIONS ###################
 
38
  # Mask-weighted mean over axis 1 of the first element of model_output.
  # NOTE(review): unsqueeze/expand/float/sum are tensor methods that look like the
  # PyTorch API, while the reduction and clipping call tf.* -- confirm which
  # framework the inputs are expected to come from.
  def mean_pooling(model_output, attention_mask):
39
  token_embeddings = model_output[0]
40
  # Give the mask a trailing axis and expand it to token_embeddings' size so it multiplies element-wise.
  input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
41
  # The denominator is clipped to at least 1e-9 so an all-zero mask cannot divide by zero.
  return tf.reduce_sum(token_embeddings * input_mask_expanded, 1) / tf.clip_by_value(input_mask_expanded.sum(1), clip_value_min=1e-9, clip_value_max=math.inf)
 
42
  def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
43
  predictions = pd.DataFrame(columns=['Class Name', 'Score'])
44
  for i in range(len(class_embeddings)):
@@ -68,6 +70,7 @@ def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensi
68
  HighestSimilarity = predictions.nlargest(N, ['Score'])
69
  return HighestSimilarity
70
 
 
71
  def sentence_embedder(sentences, model_path):
72
  tokenizer = AutoTokenizer.from_pretrained(model_path) #instantiating the sentence embedder using HuggingFace library
73
  model = AutoModel.from_pretrained(model_path, from_tf=True) #making a model instance
@@ -78,6 +81,7 @@ def sentence_embedder(sentences, model_path):
78
  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) #outputs a (1, 384) tensor representation of input text
79
  return sentence_embeddings
80
 
 
81
  # Append the submitted text to the chat history as a (text, None) pair.
  # The history is extended with `+` concatenation and rebound locally, then
  # returned together with an empty string.
  # NOTE(review): the empty string presumably clears the input box -- confirm against the caller.
  def add_text(history, text):
82
  history = history + [(text, None)]
83
  return history, ""
@@ -86,6 +90,26 @@ def add_file(history, file):
86
  history = history + [((file.name,), None)]
87
  return history
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  def bot(history):
90
  response = "**That's cool!**"
91
  history[-1][1] = response
@@ -93,6 +117,7 @@ def bot(history):
93
 
94
 
95
  ########## LOADING PRE-COMPUTED EMBEDDINGS ##########
 
96
  def clean_data(input, type='Dataframe'):
97
  if type == 'Dataframe':
98
  cleaneddf = pd.DataFrame(columns=['Class', 'Description'])
@@ -157,6 +182,7 @@ def clean_data(input, type='Dataframe'):
157
  return row
158
 
159
 
 
160
  def classifier(userin):
161
  clean_in = clean_data(userin, type='String')
162
  in_emb = sentence_embedder(clean_in, 'Model_bert')
@@ -166,6 +192,7 @@ def classifier(userin):
166
 
167
  return broad_scope_predictions
168
 
 
169
  def generateresponse(history):#, task):
170
  """
171
  Model definition here:
@@ -213,6 +240,7 @@ def generateresponse(history):#, task):
213
  return history
214
 
215
  ############# GRADIO APP ###############
 
216
  theme = gr.themes.Base(
217
  primary_hue="indigo",
218
  ).set(
 
35
  device_map="auto") #low_cpu_mem_usage=True)
36
 
37
  ########## DEFINING FUNCTIONS ###################
38
+
39
  # Mask-weighted mean over axis 1 of the first element of model_output.
  # NOTE(review): unsqueeze/expand/float/sum are tensor methods that look like the
  # PyTorch API, while the reduction and clipping call tf.* -- confirm which
  # framework the inputs are expected to come from.
  def mean_pooling(model_output, attention_mask):
40
  token_embeddings = model_output[0]
41
  # Give the mask a trailing axis and expand it to token_embeddings' size so it multiplies element-wise.
  input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
42
  # The denominator is clipped to at least 1e-9 so an all-zero mask cannot divide by zero.
  return tf.reduce_sum(token_embeddings * input_mask_expanded, 1) / tf.clip_by_value(input_mask_expanded.sum(1), clip_value_min=1e-9, clip_value_max=math.inf)
43
+
44
  def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
45
  predictions = pd.DataFrame(columns=['Class Name', 'Score'])
46
  for i in range(len(class_embeddings)):
 
70
  HighestSimilarity = predictions.nlargest(N, ['Score'])
71
  return HighestSimilarity
72
 
73
+
74
  def sentence_embedder(sentences, model_path):
75
  tokenizer = AutoTokenizer.from_pretrained(model_path) #instantiating the sentence embedder using HuggingFace library
76
  model = AutoModel.from_pretrained(model_path, from_tf=True) #making a model instance
 
81
  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) #outputs a (1, 384) tensor representation of input text
82
  return sentence_embeddings
83
 
84
+
85
  # Append the submitted text to the chat history as a (text, None) pair.
  # The history is extended with `+` concatenation and rebound locally, then
  # returned together with an empty string.
  # NOTE(review): the empty string presumably clears the input box -- confirm against the caller.
  def add_text(history, text):
86
  history = history + [(text, None)]
87
  return history, ""
 
90
  history = history + [((file.name,), None)]
91
  return history
92
 
93
def convert_saved_embeddings(embedding_string):
    """
    Convert a pre-computed embedding stored as text back into a tensor.

    Pre-computed embeddings are saved as the string form of a tensor (e.g.
    ``"tensor([[0.1, -0.2, ...]])"``) and have to be turned back into numbers
    before they can be compared with a newly computed abstract embedding.

    :param embedding_string: string form of a single embedding. The
        ``tensor(...)`` wrapper, brackets, parentheses and spaces are ignored,
        as is anything that follows the last closing bracket (such as
        ``dtype=...``).
    :return: ``torch`` tensor of shape ``(1, n_values)`` with dtype float64.
    :raises ValueError: if the string holds no values, or if one of the
        comma-separated values is not a valid float.
    """
    payload = embedding_string

    # Keep only the bracketed data so that trailing metadata in a tensor repr
    # (", dtype=...", ", grad_fn=...") is never handed to float().
    start = payload.find('[')
    end = payload.rfind(']')
    if 0 <= start < end:
        payload = payload[start:end + 1]

    # A single translate pass removes the wrapper characters; the literal
    # word "tensor" is still dropped for inputs that carry no brackets.
    payload = payload.translate(str.maketrans('', '', '()[] '))
    payload = payload.replace('tensor', '')
    if not payload.strip():
        raise ValueError('embedding string contains no numeric values')

    values = [float(token) for token in payload.split(',')]

    # np.array on Python floats yields float64; expand_dims adds the leading
    # axis so the result is a single row of shape (1, n_values).
    embedding = np.expand_dims(np.array(values), axis=0)
    return torch.from_numpy(embedding)
112
+
113
  def bot(history):
114
  response = "**That's cool!**"
115
  history[-1][1] = response
 
117
 
118
 
119
  ########## LOADING PRE-COMPUTED EMBEDDINGS ##########
120
+
121
  def clean_data(input, type='Dataframe'):
122
  if type == 'Dataframe':
123
  cleaneddf = pd.DataFrame(columns=['Class', 'Description'])
 
182
  return row
183
 
184
 
185
+
186
  def classifier(userin):
187
  clean_in = clean_data(userin, type='String')
188
  in_emb = sentence_embedder(clean_in, 'Model_bert')
 
192
 
193
  return broad_scope_predictions
194
 
195
+
196
  def generateresponse(history):#, task):
197
  """
198
  Model definition here:
 
240
  return history
241
 
242
  ############# GRADIO APP ###############
243
+
244
  theme = gr.themes.Base(
245
  primary_hue="indigo",
246
  ).set(