mtyrrell committed on
Commit
07660e6
·
1 Parent(s): bcd4037

test ADAPMIT_TECH_TEST

Browse files
Files changed (2) hide show
  1. app.py +12 -12
  2. modules/utils.py +19 -8
app.py CHANGED
@@ -26,15 +26,15 @@ from io import BytesIO
26
  logger = logging.getLogger(__name__)
27
 
28
  # Local
29
- # from dotenv import load_dotenv
30
- # load_dotenv()
31
 
32
 
33
  # Main app logic
34
  def main():
35
  # Temporarily set authentication to True for testing
36
  if 'authenticated' not in st.session_state:
37
- st.session_state['authenticated'] = False
38
 
39
  if st.session_state['authenticated']:
40
  # Remove login success message for testing
@@ -183,15 +183,15 @@ def main():
183
 
184
 
185
  # Comment out for testing
186
- else:
187
- username = st.text_input("Username")
188
- password = st.text_input("Password", type="password")
189
- if st.button("Login"):
190
- if validate_login(username, password):
191
- st.session_state['authenticated'] = True
192
- st.rerun()
193
- else:
194
- st.error("Incorrect username or password")
195
 
196
 
197
 
 
26
  logger = logging.getLogger(__name__)
27
 
28
  # Local
29
+ from dotenv import load_dotenv
30
+ load_dotenv()
31
 
32
 
33
  # Main app logic
34
  def main():
35
  # Temporarily set authentication to True for testing
36
  if 'authenticated' not in st.session_state:
37
+ st.session_state['authenticated'] = True
38
 
39
  if st.session_state['authenticated']:
40
  # Remove login success message for testing
 
183
 
184
 
185
  # Comment out for testing
186
+ # else:
187
+ # username = st.text_input("Username")
188
+ # password = st.text_input("Password", type="password")
189
+ # if st.button("Login"):
190
+ # if validate_login(username, password):
191
+ # st.session_state['authenticated'] = True
192
+ # st.rerun()
193
+ # else:
194
+ # st.error("Incorrect username or password")
195
 
196
 
197
 
modules/utils.py CHANGED
@@ -83,7 +83,7 @@ def extract_predicted_labels(output, ordinal_selection=1, threshold=0.5):
83
 
84
  # Function to call model and run inference for varying classification tasks/models
85
  def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
86
- device = torch.device("cuda") if torch.cuda.is_available() else (torch.device("mps") if torch.has_mps else torch.device("cpu"))
87
  model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
88
  if model_name in model_names_sf:
89
  col_name = re.sub(r'_(.*)', r'_txt', model_name)
@@ -91,6 +91,14 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
91
  model.to(device)
92
  # Get tokenizer from the model
93
  tokenizer = model.model_body.tokenizer
 
 
 
 
 
 
 
 
94
  else:
95
  col_name = 'scope_txt'
96
  model = pipeline("text-classification",
@@ -98,20 +106,20 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
98
  device=device,
99
  return_all_scores=multilabel,
100
  truncation=True,
101
- max_length=512)
102
  predictions = []
103
  total = len(df)
104
  for i, text in enumerate(df[col_name]):
105
  try:
106
  if model_name in model_names_sf:
107
  # Truncate text for SetFit models
108
- encoded = tokenizer(text, truncation=True, max_length=512)
109
- truncated_text = tokenizer.decode(encoded['input_ids'])
110
- prediction = model(truncated_text)
111
- predictions.append(0 if prediction == 'NEGATIVE' else 1)
112
  else:
113
  prediction = model(text)
114
- if model_name == 'ADAPMIT':
115
  predictions.append(re.sub('Label$', '', prediction[0]['label']))
116
  elif model_name == 'SECTOR':
117
  predictions.append(extract_predicted_labels(prediction[0], threshold=0.5))
@@ -169,7 +177,7 @@ def process_data(uploaded_file, sens_level):
169
 
170
  # Define models and predictions
171
  model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
172
- model_names = model_names_sf + ['ADAPMIT', 'SECTOR', 'LANG']
173
  total_predictions = len(model_names) * len(df)
174
  progress_count = 0
175
 
@@ -197,6 +205,7 @@ def process_data(uploaded_file, sens_level):
197
  df[model_name] = predict_category(df, model_name, progress_bar, repo='classifier_SF_' + model_name, profile='mtyrrell')
198
  elif model_name == 'ADAPMIT':
199
  df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
 
200
  elif model_name == 'SECTOR':
201
  sectors_dict = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
202
  df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
@@ -204,6 +213,8 @@ def process_data(uploaded_file, sens_level):
204
  elif model_name == 'LANG':
205
  df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
206
  # df[model_name] = predict_category(df, model_name, progress_bar, repo='xlm-roberta-base-language-detection', profile='papluca')
 
 
207
 
208
  logger.info(f"Completed: {model_name}")
209
  model_progress.empty()
 
83
 
84
  # Function to call model and run inference for varying classification tasks/models
85
  def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
86
+ device = torch.device("cuda") if torch.cuda.is_available() else (torch.device("mps") if torch.backends.mps.is_built() else torch.device("cpu"))
87
  model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
88
  if model_name in model_names_sf:
89
  col_name = re.sub(r'_(.*)', r'_txt', model_name)
 
91
  model.to(device)
92
  # Get tokenizer from the model
93
  tokenizer = model.model_body.tokenizer
94
+ elif model_name == 'ADAPMIT_TECH_TEST':
95
+ col_name = 'tech_txt'
96
+ model = pipeline("text-classification",
97
+ model=profile+"/"+repo,
98
+ device=device,
99
+ return_all_scores=multilabel,
100
+ truncation=True,
101
+ max_length=512)
102
  else:
103
  col_name = 'scope_txt'
104
  model = pipeline("text-classification",
 
106
  device=device,
107
  return_all_scores=multilabel,
108
  truncation=True,
109
+ max_length=512)
110
  predictions = []
111
  total = len(df)
112
  for i, text in enumerate(df[col_name]):
113
  try:
114
  if model_name in model_names_sf:
115
  # Truncate text for SetFit models
116
+ encoded = tokenizer(text, truncation=True, max_length=512)
117
+ truncated_text = tokenizer.decode(encoded['input_ids'])
118
+ prediction = model(truncated_text)
119
+ predictions.append(0 if prediction == 'NEGATIVE' else 1)
120
  else:
121
  prediction = model(text)
122
+ if model_name == 'ADAPMIT' or model_name == 'ADAPMIT_TECH_TEST':
123
  predictions.append(re.sub('Label$', '', prediction[0]['label']))
124
  elif model_name == 'SECTOR':
125
  predictions.append(extract_predicted_labels(prediction[0], threshold=0.5))
 
177
 
178
  # Define models and predictions
179
  model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
180
+ model_names = model_names_sf + ['ADAPMIT', 'SECTOR', 'LANG','ADAPMIT_TECH_TEST']
181
  total_predictions = len(model_names) * len(df)
182
  progress_count = 0
183
 
 
205
  df[model_name] = predict_category(df, model_name, progress_bar, repo='classifier_SF_' + model_name, profile='mtyrrell')
206
  elif model_name == 'ADAPMIT':
207
  df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
208
+
209
  elif model_name == 'SECTOR':
210
  sectors_dict = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
211
  df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
 
213
  elif model_name == 'LANG':
214
  df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
215
  # df[model_name] = predict_category(df, model_name, progress_bar, repo='xlm-roberta-base-language-detection', profile='papluca')
216
+ elif model_name == 'ADAPMIT_TECH_TEST':
217
+ df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
218
 
219
  logger.info(f"Completed: {model_name}")
220
  model_progress.empty()