mtyrrell commited on
Commit
49c8ea7
·
1 Parent(s): 76de63c

adapted logic for ADAPMIT (consensus on tech + scope for ineligibility)

Browse files
Files changed (2) hide show
  1. app.py +12 -12
  2. modules/utils.py +55 -22
app.py CHANGED
@@ -26,15 +26,15 @@ from io import BytesIO
26
  logger = logging.getLogger(__name__)
27
 
28
  # Local
29
- # from dotenv import load_dotenv
30
- # load_dotenv()
31
 
32
 
33
  # Main app logic
34
  def main():
35
  # Temporarily set authentication to True for testing
36
  if 'authenticated' not in st.session_state:
37
- st.session_state['authenticated'] = False
38
 
39
  if st.session_state['authenticated']:
40
  # Remove login success message for testing
@@ -183,15 +183,15 @@ def main():
183
 
184
 
185
  # Comment out for testing
186
- else:
187
- username = st.text_input("Username")
188
- password = st.text_input("Password", type="password")
189
- if st.button("Login"):
190
- if validate_login(username, password):
191
- st.session_state['authenticated'] = True
192
- st.rerun()
193
- else:
194
- st.error("Incorrect username or password")
195
 
196
 
197
 
 
26
  logger = logging.getLogger(__name__)
27
 
28
  # Local
29
+ from dotenv import load_dotenv
30
+ load_dotenv()
31
 
32
 
33
  # Main app logic
34
  def main():
35
  # Temporarily set authentication to True for testing
36
  if 'authenticated' not in st.session_state:
37
+ st.session_state['authenticated'] = True
38
 
39
  if st.session_state['authenticated']:
40
  # Remove login success message for testing
 
183
 
184
 
185
  # Comment out for testing
186
+ # else:
187
+ # username = st.text_input("Username")
188
+ # password = st.text_input("Password", type="password")
189
+ # if st.button("Login"):
190
+ # if validate_login(username, password):
191
+ # st.session_state['authenticated'] = True
192
+ # st.rerun()
193
+ # else:
194
+ # st.error("Incorrect username or password")
195
 
196
 
197
 
modules/utils.py CHANGED
@@ -85,29 +85,36 @@ def extract_predicted_labels(output, ordinal_selection=1, threshold=0.5):
85
  def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
86
  device = torch.device("cuda") if torch.cuda.is_available() else (torch.device("mps") if torch.backends.mps.is_built() else torch.device("cpu"))
87
  model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
 
 
 
 
 
 
 
 
 
88
  if model_name in model_names_sf:
89
  col_name = re.sub(r'_(.*)', r'_txt', model_name)
90
  model = SetFitModel.from_pretrained(profile+"/"+repo)
91
  model.to(device)
92
  # Get tokenizer from the model
93
  tokenizer = model.model_body.tokenizer
94
- elif model_name == 'ADAPMIT_TECH_TEST':
95
- col_name = 'tech_txt'
96
- model = pipeline("text-classification",
97
- model=profile+"/"+repo,
98
- device=device,
99
- return_all_scores=multilabel,
100
- truncation=True,
101
- max_length=512)
102
  else:
103
- col_name = 'scope_txt'
104
- model = pipeline("text-classification",
105
- model=profile+"/"+repo,
106
- device=device,
107
- return_all_scores=multilabel,
 
 
108
  truncation=True,
109
  max_length=512)
 
 
 
110
  predictions = []
 
111
  total = len(df)
112
  for i, text in enumerate(df[col_name]):
113
  try:
@@ -119,18 +126,19 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
119
  predictions.append(0 if prediction == 'NEGATIVE' else 1)
120
  else:
121
  prediction = model(text)
122
- if model_name == 'ADAPMIT' or model_name == 'ADAPMIT_TECH_TEST':
123
- predictions.append(re.sub('Label$', '', prediction[0]['label']))
124
  elif model_name == 'SECTOR':
125
  predictions.append(extract_predicted_labels(prediction[0], threshold=0.5))
126
  elif model_name == 'LANG':
127
- predictions.append(prediction[0]['label'])
128
  except Exception as e:
129
  logger.error(f"Error processing sample {df['id'][i]}: {str(e)}")
130
  st.error("Application Error. Please contact support.")
131
  # Update progress bar with each iteration
132
  progress = (i + 1) / total
133
- progress_bar.progress(progress)
 
134
  return predictions
135
 
136
 
@@ -177,7 +185,9 @@ def process_data(uploaded_file, sens_level):
177
 
178
  # Define models and predictions
179
  model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
180
- model_names = model_names_sf + ['ADAPMIT', 'SECTOR', 'LANG','ADAPMIT_TECH_TEST']
 
 
181
  total_predictions = len(model_names) * len(df)
182
  progress_count = 0
183
 
@@ -203,9 +213,10 @@ def process_data(uploaded_file, sens_level):
203
  # Load the model and run inference
204
  if model_name in model_names_sf:
205
  df[model_name] = predict_category(df, model_name, progress_bar, repo='classifier_SF_' + model_name, profile='mtyrrell')
206
- elif model_name == 'ADAPMIT':
207
  df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
208
-
 
209
  elif model_name == 'SECTOR':
210
  sectors_dict = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
211
  df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
@@ -213,8 +224,7 @@ def process_data(uploaded_file, sens_level):
213
  elif model_name == 'LANG':
214
  df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
215
  # df[model_name] = predict_category(df, model_name, progress_bar, repo='xlm-roberta-base-language-detection', profile='papluca')
216
- elif model_name == 'ADAPMIT_TECH_TEST':
217
- df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
218
 
219
  logger.info(f"Completed: {model_name}")
220
  model_progress.empty()
@@ -243,6 +253,21 @@ def process_data(uploaded_file, sens_level):
243
 
244
  st.write(f'Processing complete. Total time: {elapsed_time:.1f} seconds')
245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  # Convert funding columns to numeric, replacing any non-numeric values with NaN
247
  df['maf_funding'] = pd.to_numeric(df['maf_funding'], errors='coerce')
248
  df['cont_public'] = pd.to_numeric(df['cont_public'], errors='coerce')
@@ -284,6 +309,14 @@ def process_data(uploaded_file, sens_level):
284
  else 'PRE-ASSESSMENT' if sens_level+1 <= x['pred_score'] <= sens_level+2
285
  else 'FULL-ASSESSMENT' if x['pred_score'] > sens_level+2
286
  else 'ERROR', axis=1)
 
 
 
 
 
 
 
 
287
 
288
  return df
289
 
 
85
  def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
86
  device = torch.device("cuda") if torch.cuda.is_available() else (torch.device("mps") if torch.backends.mps.is_built() else torch.device("cpu"))
87
  model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
88
+
89
+ # Model configuration mapping
90
+ model_config = {
91
+ 'ADAPMIT_TECH': {'col_name': 'tech_txt', 'top_k': 1},
92
+ 'ADAPMIT_SCOPE': {'col_name': 'scope_txt', 'top_k': 1},
93
+ 'LANG': {'col_name': 'scope_txt', 'top_k': 1},
94
+ 'default': {'col_name': 'scope_txt', 'top_k': None}
95
+ }
96
+
97
  if model_name in model_names_sf:
98
  col_name = re.sub(r'_(.*)', r'_txt', model_name)
99
  model = SetFitModel.from_pretrained(profile+"/"+repo)
100
  model.to(device)
101
  # Get tokenizer from the model
102
  tokenizer = model.model_body.tokenizer
 
 
 
 
 
 
 
 
103
  else:
104
+ # Get configuration for the model, falling back to default if not specified
105
+ config = model_config.get(model_name, model_config['default'])
106
+ col_name = config['col_name']
107
+ model = pipeline("text-classification",
108
+ model=profile+"/"+repo,
109
+ device=device,
110
+ top_k=config['top_k'],
111
  truncation=True,
112
  max_length=512)
113
+
114
+
115
+
116
  predictions = []
117
+ # probabilities = []
118
  total = len(df)
119
  for i, text in enumerate(df[col_name]):
120
  try:
 
126
  predictions.append(0 if prediction == 'NEGATIVE' else 1)
127
  else:
128
  prediction = model(text)
129
+ if model_name == 'ADAPMIT_SCOPE' or model_name == 'ADAPMIT_TECH':
130
+ predictions.append(re.sub('Label$', '', prediction[0][0]['label']))
131
  elif model_name == 'SECTOR':
132
  predictions.append(extract_predicted_labels(prediction[0], threshold=0.5))
133
  elif model_name == 'LANG':
134
+ predictions.append(prediction[0][0]['label'])
135
  except Exception as e:
136
  logger.error(f"Error processing sample {df['id'][i]}: {str(e)}")
137
  st.error("Application Error. Please contact support.")
138
  # Update progress bar with each iteration
139
  progress = (i + 1) / total
140
+ progress_bar.progress(progress)
141
+
142
  return predictions
143
 
144
 
 
185
 
186
  # Define models and predictions
187
  model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
188
+ model_names = model_names_sf + ['ADAPMIT_SCOPE','ADAPMIT_TECH','SECTOR','LANG']
189
+ # model_names_sf = []
190
+ # model_names = ['ADAPMIT_SCOPE','ADAPMIT_TECH']
191
  total_predictions = len(model_names) * len(df)
192
  progress_count = 0
193
 
 
213
  # Load the model and run inference
214
  if model_name in model_names_sf:
215
  df[model_name] = predict_category(df, model_name, progress_bar, repo='classifier_SF_' + model_name, profile='mtyrrell')
216
+ elif model_name == 'ADAPMIT_SCOPE':
217
  df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
218
+ elif model_name == 'ADAPMIT_TECH':
219
+ df[model_name]= predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
220
  elif model_name == 'SECTOR':
221
  sectors_dict = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
222
  df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
 
224
  elif model_name == 'LANG':
225
  df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
226
  # df[model_name] = predict_category(df, model_name, progress_bar, repo='xlm-roberta-base-language-detection', profile='papluca')
227
+
 
228
 
229
  logger.info(f"Completed: {model_name}")
230
  model_progress.empty()
 
253
 
254
  st.write(f'Processing complete. Total time: {elapsed_time:.1f} seconds')
255
 
256
+
257
+ # df['ADAPMIT_SCOPE_SCORE'] = df['ADAPMIT_SCOPE'].apply(
258
+ # lambda x: next((item['score'] for item in x if item['label'] == 'MitigationLabel'), 0)
259
+ # )
260
+ # df['ADAPMIT_TECH_SCORE'] = df['ADAPMIT_TECH'].apply(
261
+ # lambda x: next((item['score'] for item in x if item['label'] == 'MitigationLabel'), 0)
262
+ # )
263
+
264
+ # # Calculate average mitigation score
265
+ # df['ADAPMIT_SCORE'] = (df['ADAPMIT_SCOPE_SCORE'] + df['ADAPMIT_TECH_SCORE']) / 2
266
+
267
+ df['ADAPMIT'] = df.apply(lambda x: 'Adaptation' if x['ADAPMIT_SCOPE'] == 'Adaptation' and x['ADAPMIT_TECH'] == 'Adaptation' else 'Mitigation', axis=1)
268
+
269
+
270
+
271
  # Convert funding columns to numeric, replacing any non-numeric values with NaN
272
  df['maf_funding'] = pd.to_numeric(df['maf_funding'], errors='coerce')
273
  df['cont_public'] = pd.to_numeric(df['cont_public'], errors='coerce')
 
309
  else 'PRE-ASSESSMENT' if sens_level+1 <= x['pred_score'] <= sens_level+2
310
  else 'FULL-ASSESSMENT' if x['pred_score'] > sens_level+2
311
  else 'ERROR', axis=1)
312
+
313
+ # Reorder columns in final dataframe
314
+ column_order = ['id', 'scope_txt', 'tech_txt', 'fin_txt', 'maf_funding', 'cont_public',
315
+ 'cont_private', 'cont_other', 'scope_lab1', 'scope_lab2', 'tech_lab1',
316
+ 'tech_lab3', 'fin_lab2', 'ADAPMIT_SCOPE', 'ADAPMIT_TECH', 'ADAPMIT', 'SECTOR1',
317
+ 'SECTOR2', 'LANG', 'lev_total', 'lev_gt_0', 'lev_maf_%', 'lev_maf_scale',
318
+ 'word_length_check', 'pred_score', 'pred_action']
319
+ df = df[column_order]
320
 
321
  return df
322