Spaces:

GT4SD
/

PatentToolkit

Runtime error

App Files Files Community

EmicoBinsfinder committed on Apr 10, 2023

Commit

fb737cc

1 Parent(s): 04fbf3e

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -0

app.py CHANGED Viewed

@@ -35,10 +35,12 @@ model = LlamaForCausalLM.from_pretrained(
     device_map="auto") #low_cpu_mem_usage=True)
 ########## DEFINING FUNCTIONS ###################
 def mean_pooling(model_output, attention_mask):
     """Average the token embeddings of each sequence, weighted by the attention mask.

     :param model_output: indexable model output; element 0 is used as the per-token embeddings.
     :param attention_mask: mask that is expanded to the size of the token embeddings
         (presumably 1 for real tokens and 0 for padding -- confirm against the tokenizer).
     :return: masked sum over axis 1 divided by the mask total, with the divisor floored at 1e-9.
     """
     # NOTE(review): the mask is built with torch-style tensor methods while the
     # reduction uses TensorFlow ops -- confirm that mixing the two is intended.
     embeddings = model_output[0]
     mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
     masked_total = tf.reduce_sum(embeddings * mask, 1)
     # Flooring the divisor avoids a division by zero when a mask row sums to zero.
     mask_total = tf.clip_by_value(mask.sum(1), clip_value_min=1e-9, clip_value_max=math.inf)
     return masked_total / mask_total
 def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
     predictions = pd.DataFrame(columns=['Class Name', 'Score'])
     for i in range(len(class_embeddings)):
@@ -68,6 +70,7 @@ def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensi
     HighestSimilarity = predictions.nlargest(N, ['Score'])
     return HighestSimilarity
 def sentence_embedder(sentences, model_path):
   tokenizer = AutoTokenizer.from_pretrained(model_path) #instantiating the sentence embedder using HuggingFace library
   model = AutoModel.from_pretrained(model_path, from_tf=True) #making a model instance
@@ -78,6 +81,7 @@ def sentence_embedder(sentences, model_path):
   sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) #outputs a (1, 384) tensor representation of input text
   return sentence_embeddings
 def add_text(history, text):
     """Append the user's message to the chat history as a turn with no reply yet.

     :param history: existing list of chat turns; a new list is built rather than extending it in place.
     :param text: the message to append.
     :return: tuple of the extended history and an empty string
         (presumably used to clear the input box -- confirm against the UI wiring).
     """
     pending_turn = (text, None)
     return history + [pending_turn], ""
@@ -86,6 +90,26 @@ def add_file(history, file):
     history = history + [((file.name,), None)]
     return history
 def bot(history):
     response = "**That's cool!**"
     history[-1][1] = response
@@ -93,6 +117,7 @@ def bot(history):
 ########## LOADING PRE-COMPUTED EMBEDDINGS ##########
 def clean_data(input, type='Dataframe'):
     if type == 'Dataframe':
         cleaneddf = pd.DataFrame(columns=['Class', 'Description'])
@@ -157,6 +182,7 @@ def clean_data(input, type='Dataframe'):
         return row
 def classifier(userin):
     clean_in = clean_data(userin, type='String')
     in_emb = sentence_embedder(clean_in, 'Model_bert')
@@ -166,6 +192,7 @@ def classifier(userin):
     return broad_scope_predictions
 def generateresponse(history):#, task):
     """
     Model definition here:
@@ -213,6 +240,7 @@ def generateresponse(history):#, task):
     return history
 ############# GRADIO APP ###############
 theme = gr.themes.Base(
     primary_hue="indigo",
 ).set(

     device_map="auto") #low_cpu_mem_usage=True)
 ########## DEFINING FUNCTIONS ###################
 def mean_pooling(model_output, attention_mask):
     """Average the token embeddings of each sequence, weighted by the attention mask.

     :param model_output: indexable model output; element 0 is used as the per-token embeddings.
     :param attention_mask: mask that is expanded to the size of the token embeddings
         (presumably 1 for real tokens and 0 for padding -- confirm against the tokenizer).
     :return: masked sum over axis 1 divided by the mask total, with the divisor floored at 1e-9.
     """
     # NOTE(review): the mask is built with torch-style tensor methods while the
     # reduction uses TensorFlow ops -- confirm that mixing the two is intended.
     embeddings = model_output[0]
     mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
     masked_total = tf.reduce_sum(embeddings * mask, 1)
     # Flooring the divisor avoids a division by zero when a mask row sums to zero.
     mask_total = tf.clip_by_value(mask.sum(1), clip_value_min=1e-9, clip_value_max=math.inf)
     return masked_total / mask_total
 def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
     predictions = pd.DataFrame(columns=['Class Name', 'Score'])
     for i in range(len(class_embeddings)):
     HighestSimilarity = predictions.nlargest(N, ['Score'])
     return HighestSimilarity
 def sentence_embedder(sentences, model_path):
   tokenizer = AutoTokenizer.from_pretrained(model_path) #instantiating the sentence embedder using HuggingFace library
   model = AutoModel.from_pretrained(model_path, from_tf=True) #making a model instance
   sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) #outputs a (1, 384) tensor representation of input text
   return sentence_embeddings
 def add_text(history, text):
     """Append the user's message to the chat history as a turn with no reply yet.

     :param history: existing list of chat turns; a new list is built rather than extending it in place.
     :param text: the message to append.
     :return: tuple of the extended history and an empty string
         (presumably used to clear the input box -- confirm against the UI wiring).
     """
     pending_turn = (text, None)
     return history + [pending_turn], ""
     history = history + [((file.name,), None)]
     return history
+def convert_saved_embeddings(embedding_string):
+    """
+    Prepare a pre-computed embedding for comparison with new abstract embeddings.
+    Pre-computed embeddings are saved as tensors in string format, so they need to be parsed back into numbers before cosine similarity can be calculated.
+    :param embedding_string: String form of a saved tensor, e.g. "tensor([[0.1, 0.2]])"; expected to hold a single embedding (384 values according to the original note -- TODO confirm).
+    :return: torch tensor of shape (1, n), where n is the number of comma-separated values parsed from the string.
+    :raises ValueError: if any comma-separated piece left after cleaning is not a valid float (e.g. an empty string).
+    """
+    # Strip the wrapper characters so that only comma-separated numbers remain.
+    # The order matters: 'tensor' is removed after the brackets and before the spaces.
+    embedding = embedding_string.replace('(', '')
+    embedding = embedding.replace(')', '')
+    embedding = embedding.replace('[', '')
+    embedding = embedding.replace(']', '')
+    embedding = embedding.replace('tensor', '')
+    embedding = embedding.replace(' ', '')
+    embedding = embedding.split(',')
+    embedding = [float(x) for x in embedding]
+    # NOTE(review): np.array on Python floats presumably yields float64 -- confirm this
+    # matches the dtype of freshly computed embeddings before comparing them.
+    embedding = np.array(embedding)
+    # Add a leading axis so the result has shape (1, n) instead of (n,).
+    embedding = np.expand_dims(embedding, axis=0)
+    embedding = torch.from_numpy(embedding)
+    return embedding
 def bot(history):
     response = "**That's cool!**"
     history[-1][1] = response
 ########## LOADING PRE-COMPUTED EMBEDDINGS ##########
 def clean_data(input, type='Dataframe'):
     if type == 'Dataframe':
         cleaneddf = pd.DataFrame(columns=['Class', 'Description'])
         return row
 def classifier(userin):
     clean_in = clean_data(userin, type='String')
     in_emb = sentence_embedder(clean_in, 'Model_bert')
     return broad_scope_predictions
 def generateresponse(history):#, task):
     """
     Model definition here:
     return history
 ############# GRADIO APP ###############
 theme = gr.themes.Base(
     primary_hue="indigo",
 ).set(