adapted logic for ADAPMIT (consensus on tech + scope for ineligibility)
- app.py +12 -12
- modules/utils.py +55 -22
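For reference, the "consensus on tech + scope" in the title is the rule added in modules/utils.py below: a project is labelled Adaptation only when both the ADAPMIT_SCOPE and ADAPMIT_TECH classifiers predict Adaptation; any disagreement falls back to Mitigation. A minimal pandas sketch with made-up rows:

import pandas as pd

# Toy outputs from the two ADAPMIT classifiers (illustrative rows only)
df = pd.DataFrame({
    'ADAPMIT_SCOPE': ['Adaptation', 'Adaptation', 'Mitigation'],
    'ADAPMIT_TECH':  ['Adaptation', 'Mitigation', 'Mitigation'],
})

# Consensus rule introduced by this commit: Adaptation only when both columns agree
df['ADAPMIT'] = df.apply(
    lambda x: 'Adaptation'
    if x['ADAPMIT_SCOPE'] == 'Adaptation' and x['ADAPMIT_TECH'] == 'Adaptation'
    else 'Mitigation',
    axis=1,
)
# Resulting ADAPMIT column: ['Adaptation', 'Mitigation', 'Mitigation']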
app.py
CHANGED
@@ -26,15 +26,15 @@ from io import BytesIO
 logger = logging.getLogger(__name__)
 
 # Local
-
-
+from dotenv import load_dotenv
+load_dotenv()
 
 
 # Main app logic
 def main():
     # Temporarily set authentication to True for testing
     if 'authenticated' not in st.session_state:
-        st.session_state['authenticated'] =
+        st.session_state['authenticated'] = True
 
     if st.session_state['authenticated']:
         # Remove login success message for testing
@@ -183,15 +183,15 @@ def main():
 
 
     # Comment out for testing
-    else:
-        username = st.text_input("Username")
-        password = st.text_input("Password", type="password")
-        if st.button("Login"):
-            if validate_login(username, password):
-                st.session_state['authenticated'] = True
-                st.rerun()
-            else:
-                st.error("Incorrect username or password")
+    # else:
+    #     username = st.text_input("Username")
+    #     password = st.text_input("Password", type="password")
+    #     if st.button("Login"):
+    #         if validate_login(username, password):
+    #             st.session_state['authenticated'] = True
+    #             st.rerun()
+    #         else:
+    #             st.error("Incorrect username or password")
 
 
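For context on the two new lines under "# Local": load_dotenv() reads a local .env file and copies its key=value pairs into the process environment, so the app can pick up credentials via os.getenv instead of hardcoding them. A minimal sketch; the variable name HF_TOKEN is an assumption for illustration, not taken from this commit:

import os
from dotenv import load_dotenv

load_dotenv()                     # loads key=value pairs from a .env file in the working directory
hf_token = os.getenv("HF_TOKEN")  # hypothetical variable name; returns None if it is not set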
modules/utils.py
CHANGED
@@ -85,29 +85,36 @@ def extract_predicted_labels(output, ordinal_selection=1, threshold=0.5):
 def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
     device = torch.device("cuda") if torch.cuda.is_available() else (torch.device("mps") if torch.backends.mps.is_built() else torch.device("cpu"))
     model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
+
+    # Model configuration mapping
+    model_config = {
+        'ADAPMIT_TECH': {'col_name': 'tech_txt', 'top_k': 1},
+        'ADAPMIT_SCOPE': {'col_name': 'scope_txt', 'top_k': 1},
+        'LANG': {'col_name': 'scope_txt', 'top_k': 1},
+        'default': {'col_name': 'scope_txt', 'top_k': None}
+    }
+
     if model_name in model_names_sf:
         col_name = re.sub(r'_(.*)', r'_txt', model_name)
         model = SetFitModel.from_pretrained(profile+"/"+repo)
         model.to(device)
         # Get tokenizer from the model
         tokenizer = model.model_body.tokenizer
-    elif model_name == 'ADAPMIT_TECH_TEST':
-        col_name = 'tech_txt'
-        model = pipeline("text-classification",
-                         model=profile+"/"+repo,
-                         device=device,
-                         return_all_scores=multilabel,
-                         truncation=True,
-                         max_length=512)
     else:
-
-
-
-
-
+        # Get configuration for the model, falling back to default if not specified
+        config = model_config.get(model_name, model_config['default'])
+        col_name = config['col_name']
+        model = pipeline("text-classification",
+                         model=profile+"/"+repo,
+                         device=device,
+                         top_k=config['top_k'],
                          truncation=True,
                          max_length=512)
+
+
+
     predictions = []
+    # probabilities = []
     total = len(df)
     for i, text in enumerate(df[col_name]):
         try:
@@ -119,18 +126,19 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
                 predictions.append(0 if prediction == 'NEGATIVE' else 1)
             else:
                 prediction = model(text)
-                if model_name == '
-                    predictions.append(re.sub('Label$', '', prediction[0]['label']))
+                if model_name == 'ADAPMIT_SCOPE' or model_name == 'ADAPMIT_TECH':
+                    predictions.append(re.sub('Label$', '', prediction[0][0]['label']))
                 elif model_name == 'SECTOR':
                     predictions.append(extract_predicted_labels(prediction[0], threshold=0.5))
                 elif model_name == 'LANG':
-                    predictions.append(prediction[0]['label'])
+                    predictions.append(prediction[0][0]['label'])
         except Exception as e:
             logger.error(f"Error processing sample {df['id'][i]}: {str(e)}")
             st.error("Application Error. Please contact support.")
         # Update progress bar with each iteration
         progress = (i + 1) / total
-        progress_bar.progress(progress)
+        progress_bar.progress(progress)
+
     return predictions
 
 
@@ -177,7 +185,9 @@ def process_data(uploaded_file, sens_level):
 
     # Define models and predictions
     model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
-    model_names = model_names_sf + ['
+    model_names = model_names_sf + ['ADAPMIT_SCOPE','ADAPMIT_TECH','SECTOR','LANG']
+    # model_names_sf = []
+    # model_names = ['ADAPMIT_SCOPE','ADAPMIT_TECH']
     total_predictions = len(model_names) * len(df)
     progress_count = 0
 
@@ -203,9 +213,10 @@ def process_data(uploaded_file, sens_level):
         # Load the model and run inference
         if model_name in model_names_sf:
             df[model_name] = predict_category(df, model_name, progress_bar, repo='classifier_SF_' + model_name, profile='mtyrrell')
-        elif model_name == '
+        elif model_name == 'ADAPMIT_SCOPE':
             df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
-
+        elif model_name == 'ADAPMIT_TECH':
+            df[model_name]= predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
         elif model_name == 'SECTOR':
             sectors_dict = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
             df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
@@ -213,8 +224,7 @@ def process_data(uploaded_file, sens_level):
         elif model_name == 'LANG':
             df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
             # df[model_name] = predict_category(df, model_name, progress_bar, repo='xlm-roberta-base-language-detection', profile='papluca')
-
-            df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
+
 
         logger.info(f"Completed: {model_name}")
         model_progress.empty()
@@ -243,6 +253,21 @@ def process_data(uploaded_file, sens_level):
 
     st.write(f'Processing complete. Total time: {elapsed_time:.1f} seconds')
 
+
+    # df['ADAPMIT_SCOPE_SCORE'] = df['ADAPMIT_SCOPE'].apply(
+    #     lambda x: next((item['score'] for item in x if item['label'] == 'MitigationLabel'), 0)
+    # )
+    # df['ADAPMIT_TECH_SCORE'] = df['ADAPMIT_TECH'].apply(
+    #     lambda x: next((item['score'] for item in x if item['label'] == 'MitigationLabel'), 0)
+    # )
+
+    # # Calculate average mitigation score
+    # df['ADAPMIT_SCORE'] = (df['ADAPMIT_SCOPE_SCORE'] + df['ADAPMIT_TECH_SCORE']) / 2
+
+    df['ADAPMIT'] = df.apply(lambda x: 'Adaptation' if x['ADAPMIT_SCOPE'] == 'Adaptation' and x['ADAPMIT_TECH'] == 'Adaptation' else 'Mitigation', axis=1)
+
+
+
     # Convert funding columns to numeric, replacing any non-numeric values with NaN
     df['maf_funding'] = pd.to_numeric(df['maf_funding'], errors='coerce')
     df['cont_public'] = pd.to_numeric(df['cont_public'], errors='coerce')
@@ -284,6 +309,14 @@ def process_data(uploaded_file, sens_level):
                                   else 'PRE-ASSESSMENT' if sens_level+1 <= x['pred_score'] <= sens_level+2
                                   else 'FULL-ASSESSMENT' if x['pred_score'] > sens_level+2
                                   else 'ERROR', axis=1)
+
+    # Reorder columns in final dataframe
+    column_order = ['id', 'scope_txt', 'tech_txt', 'fin_txt', 'maf_funding', 'cont_public',
+                    'cont_private', 'cont_other', 'scope_lab1', 'scope_lab2', 'tech_lab1',
+                    'tech_lab3', 'fin_lab2', 'ADAPMIT_SCOPE', 'ADAPMIT_TECH', 'ADAPMIT', 'SECTOR1',
+                    'SECTOR2', 'LANG', 'lev_total', 'lev_gt_0', 'lev_maf_%', 'lev_maf_scale',
+                    'word_length_check', 'pred_score', 'pred_action']
+    df = df[column_order]
 
     return df
 
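A note on the indexing change from prediction[0]['label'] to prediction[0][0]['label']: the diff suggests that once top_k is passed to the text-classification pipeline, each input yields a list of {'label', 'score'} dicts, so a single string comes back as a nested list. A small defensive sketch, assuming the output is either the flat or the nested shape (not part of the commit):

def top_label(prediction):
    # Works for both [{'label': ..., 'score': ...}, ...] and
    # [[{'label': ..., 'score': ...}, ...]] (the nested shape seen with top_k set).
    first = prediction[0]
    if isinstance(first, list):
        first = first[0]
    return first['label']

# e.g. predictions.append(re.sub('Label$', '', top_label(model(text))))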