EmicoBinsfinder committed on
Commit
4515305
·
1 Parent(s): 97632b5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -19
app.py CHANGED
@@ -18,6 +18,9 @@ nltk.download('stopwords')
18
  nltk.download('punkt')
19
  import string
20
 
 
 
 
21
  ########### DATA CLEANER VARIABLES #############
22
  all_stopwords = stopwords.words('english') # Making sure to only use English stopwords
23
  extra_stopwords = ['ii', 'iii'] # Can add extra stopwords to be removed from dataset/input abstracts
@@ -93,20 +96,6 @@ def bot(history):
93
 
94
  ########## LOADING PRE-COMPUTED EMBEDDINGS ##########
95
  def clean_data(input, type='Dataframe'):
96
- """
97
- As preparation for use with the text similarity model, this function removes superfluous data from either a dataframe full of
98
- classifications, or an input string, in order for embeddings to be calculated for them. Removes:
99
- • Entries with missing abstracts/descriptions/classifications/typos
100
- • Duplicate entries
101
- • Unnecessary punctuation
102
- • Stop words (e.g., by, a , an, he, she, it)
103
- • URLs
104
- • All entries are in the same language
105
- :param input: Either a dataframe or an individual string
106
- :param type: Tells fucntion whether input is a dataframe or an individual string
107
- :return: (if dataframe), returns a dataframe containing CPC classfication codes and their associated 'cleaned' description
108
- :return: (if string), returns a 'cleaned' version of the input string
109
- """
110
  if type == 'Dataframe':
111
  cleaneddf = pd.DataFrame(columns=['Class', 'Description'])
112
  for i in range(0, len(input)):
@@ -173,7 +162,7 @@ def clean_data(input, type='Dataframe'):
173
  def classifier(userin):
174
  clean_in = clean_data(userin, type='String')
175
  in_emb = sentence_embedder(clean_in, 'Model_bert')
176
-
177
  Number = 10
178
  broad_scope_predictions = broad_scope_class_predictor(class_embeddings, in_emb, Number, Sensitivity='High')
179
 
@@ -225,9 +214,6 @@ def generateresponse(history):#, task):
225
  print(history)
226
  return history
227
 
228
-
229
-
230
-
231
  ############# GRADIO APP ###############
232
  theme = gr.themes.Base(
233
  primary_hue="indigo",
@@ -240,7 +226,7 @@ with gr.Blocks(title='Claimed', theme=theme) as demo:
240
  gr.Markdown("""
241
  # CLAIMED - A GENERATIVE TOOLKIT FOR PATENT ATTORNEYS
242
 
243
- The patenting process can by incredibly time-consuming and expensive. We're on a mission to change that.
244
 
245
  Welcome to our demo! We've trained Meta's Llama on over 200k entries, with a focus on tasks related to the intellectual property domain.
246
 
 
18
  nltk.download('punkt')
19
  import string
20
 
21
+ ########### Import Classifier Embeddings #########
22
+ class_embeddings = pd.read_csv('Embeddings/MainClassEmbeddings.csv')
23
+
24
  ########### DATA CLEANER VARIABLES #############
25
  all_stopwords = stopwords.words('english') # Making sure to only use English stopwords
26
  extra_stopwords = ['ii', 'iii'] # Can add extra stopwords to be removed from dataset/input abstracts
 
96
 
97
  ########## LOADING PRE-COMPUTED EMBEDDINGS ##########
98
  def clean_data(input, type='Dataframe'):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  if type == 'Dataframe':
100
  cleaneddf = pd.DataFrame(columns=['Class', 'Description'])
101
  for i in range(0, len(input)):
 
162
  def classifier(userin):
163
  clean_in = clean_data(userin, type='String')
164
  in_emb = sentence_embedder(clean_in, 'Model_bert')
165
+
166
  Number = 10
167
  broad_scope_predictions = broad_scope_class_predictor(class_embeddings, in_emb, Number, Sensitivity='High')
168
 
 
214
  print(history)
215
  return history
216
 
 
 
 
217
  ############# GRADIO APP ###############
218
  theme = gr.themes.Base(
219
  primary_hue="indigo",
 
226
  gr.Markdown("""
227
  # CLAIMED - A GENERATIVE TOOLKIT FOR PATENT ATTORNEYS
228
 
229
+ The patenting process can be incredibly time-consuming and expensive. We're on a mission to change that.
230
 
231
  Welcome to our demo! We've trained Meta's Llama on over 200k entries, with a focus on tasks related to the intellectual property domain.
232