Spaces:

GT4SD
/

PatentToolkit

Runtime error

App Files Files Community

EmicoBinsfinder committed on Apr 10, 2023

Commit

4515305

1 Parent(s): 97632b5

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -19

app.py CHANGED Viewed

@@ -18,6 +18,9 @@ nltk.download('stopwords')
 nltk.download('punkt')
 import string
 ########### DATA CLEANER VARIABLES #############
 all_stopwords = stopwords.words('english') # Making sure to only use English stopwords
 extra_stopwords = ['ii', 'iii'] # Can add extra stopwords to be removed from dataset/input abstracts
@@ -93,20 +96,6 @@ def bot(history):
 ########## LOADING PRE-COMPUTED EMBEDDINGS ##########
 def clean_data(input, type='Dataframe'):
-    """
-    As preparation for use with the text similarity model, this function removes superfluous data from either a dataframe full of
-    classifications, or an input string, in order for embeddings to be calculated for them. Removes:
-    •	Entries with missing abstracts/descriptions/classifications/typos
-    •	Duplicate entries
-    •   Unnecessary punctuation
-    •	Stop words (e.g., by, a , an, he, she, it)
-    •  	URLs
-    •	All entries are in the same language
-    :param input: Either a dataframe or an individual string
-    :param type: Tells fucntion whether input is a dataframe or an individual string
-    :return: (if dataframe), returns a dataframe containing CPC classfication codes and their associated 'cleaned' description
-    :return:  (if string), returns a 'cleaned' version of the input string
-    """
     if type == 'Dataframe':
         cleaneddf = pd.DataFrame(columns=['Class', 'Description'])
         for i in range(0, len(input)):
@@ -173,7 +162,7 @@ def clean_data(input, type='Dataframe'):
 def classifier(userin):
     clean_in = clean_data(userin, type='String')
     in_emb = sentence_embedder(clean_in, 'Model_bert')
     Number = 10
     broad_scope_predictions = broad_scope_class_predictor(class_embeddings, in_emb, Number, Sensitivity='High')
@@ -225,9 +214,6 @@ def generateresponse(history):#, task):
     print(history)
     return history
 ############# GRADIO APP ###############
 theme = gr.themes.Base(
     primary_hue="indigo",
@@ -240,7 +226,7 @@ with gr.Blocks(title='Claimed', theme=theme) as demo:
     gr.Markdown("""
     # CLAIMED - A GENERATIVE TOOLKIT FOR PATENT ATTORNEYS
-    The patenting process can by incredibly time-consuming and expensive. We're on a mission to change that.
     Welcome to our demo! We've trained Meta's Llama on over 200k entries, with a focus on tasks related to the intellectual property domain.

 nltk.download('punkt')
 import string
+########### Import Classifier Embeddings #########
+class_embeddings = pd.read_csv('Embeddings/MainClassEmbeddings.csv')
 ########### DATA CLEANER VARIABLES #############
 all_stopwords = stopwords.words('english') # Making sure to only use English stopwords
 extra_stopwords = ['ii', 'iii'] # Can add extra stopwords to be removed from dataset/input abstracts
 ########## LOADING PRE-COMPUTED EMBEDDINGS ##########
 def clean_data(input, type='Dataframe'):
     if type == 'Dataframe':
         cleaneddf = pd.DataFrame(columns=['Class', 'Description'])
         for i in range(0, len(input)):
 def classifier(userin):
     clean_in = clean_data(userin, type='String')
     in_emb = sentence_embedder(clean_in, 'Model_bert')
     Number = 10
     broad_scope_predictions = broad_scope_class_predictor(class_embeddings, in_emb, Number, Sensitivity='High')
     print(history)
     return history
 ############# GRADIO APP ###############
 theme = gr.themes.Base(
     primary_hue="indigo",
     gr.Markdown("""
     # CLAIMED - A GENERATIVE TOOLKIT FOR PATENT ATTORNEYS
+    The patenting process can be incredibly time-consuming and expensive. We're on a mission to change that.
     Welcome to our demo! We've trained Meta's Llama on over 200k entries, with a focus on tasks related to the intellectual property domain.