Spaces:
Running
Running
test ADAPTMIT_TEXT
Browse files
- app.py +12 -12
- modules/utils.py +19 -8
app.py
CHANGED
@@ -26,15 +26,15 @@ from io import BytesIO
|
|
26 |
logger = logging.getLogger(__name__)
|
27 |
|
28 |
# Local
|
29 |
-
|
30 |
-
|
31 |
|
32 |
|
33 |
# Main app logic
|
34 |
def main():
|
35 |
# Temporarily set authentication to True for testing
|
36 |
if 'authenticated' not in st.session_state:
|
37 |
-
st.session_state['authenticated'] =
|
38 |
|
39 |
if st.session_state['authenticated']:
|
40 |
# Remove login success message for testing
|
@@ -183,15 +183,15 @@ def main():
|
|
183 |
|
184 |
|
185 |
# Comment out for testing
|
186 |
-
else:
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
|
196 |
|
197 |
|
|
|
26 |
logger = logging.getLogger(__name__)
|
27 |
|
28 |
# Local
|
29 |
+
from dotenv import load_dotenv
|
30 |
+
load_dotenv()
|
31 |
|
32 |
|
33 |
# Main app logic
|
34 |
def main():
|
35 |
# Temporarily set authentication to True for testing
|
36 |
if 'authenticated' not in st.session_state:
|
37 |
+
st.session_state['authenticated'] = True
|
38 |
|
39 |
if st.session_state['authenticated']:
|
40 |
# Remove login success message for testing
|
|
|
183 |
|
184 |
|
185 |
# Comment out for testing
|
186 |
+
# else:
|
187 |
+
# username = st.text_input("Username")
|
188 |
+
# password = st.text_input("Password", type="password")
|
189 |
+
# if st.button("Login"):
|
190 |
+
# if validate_login(username, password):
|
191 |
+
# st.session_state['authenticated'] = True
|
192 |
+
# st.rerun()
|
193 |
+
# else:
|
194 |
+
# st.error("Incorrect username or password")
|
195 |
|
196 |
|
197 |
|
modules/utils.py
CHANGED
@@ -83,7 +83,7 @@ def extract_predicted_labels(output, ordinal_selection=1, threshold=0.5):
|
|
83 |
|
84 |
# Function to call model and run inference for varying classification tasks/models
|
85 |
def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
|
86 |
-
device = torch.device("cuda") if torch.cuda.is_available() else (torch.device("mps") if torch.
|
87 |
model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
|
88 |
if model_name in model_names_sf:
|
89 |
col_name = re.sub(r'_(.*)', r'_txt', model_name)
|
@@ -91,6 +91,14 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
|
|
91 |
model.to(device)
|
92 |
# Get tokenizer from the model
|
93 |
tokenizer = model.model_body.tokenizer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
else:
|
95 |
col_name = 'scope_txt'
|
96 |
model = pipeline("text-classification",
|
@@ -98,20 +106,20 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
|
|
98 |
device=device,
|
99 |
return_all_scores=multilabel,
|
100 |
truncation=True,
|
101 |
-
max_length=512)
|
102 |
predictions = []
|
103 |
total = len(df)
|
104 |
for i, text in enumerate(df[col_name]):
|
105 |
try:
|
106 |
if model_name in model_names_sf:
|
107 |
# Truncate text for SetFit models
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
else:
|
113 |
prediction = model(text)
|
114 |
-
if model_name == 'ADAPMIT':
|
115 |
predictions.append(re.sub('Label$', '', prediction[0]['label']))
|
116 |
elif model_name == 'SECTOR':
|
117 |
predictions.append(extract_predicted_labels(prediction[0], threshold=0.5))
|
@@ -169,7 +177,7 @@ def process_data(uploaded_file, sens_level):
|
|
169 |
|
170 |
# Define models and predictions
|
171 |
model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
|
172 |
-
model_names = model_names_sf + ['ADAPMIT', 'SECTOR', 'LANG']
|
173 |
total_predictions = len(model_names) * len(df)
|
174 |
progress_count = 0
|
175 |
|
@@ -197,6 +205,7 @@ def process_data(uploaded_file, sens_level):
|
|
197 |
df[model_name] = predict_category(df, model_name, progress_bar, repo='classifier_SF_' + model_name, profile='mtyrrell')
|
198 |
elif model_name == 'ADAPMIT':
|
199 |
df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
|
|
|
200 |
elif model_name == 'SECTOR':
|
201 |
sectors_dict = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
|
202 |
df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
|
@@ -204,6 +213,8 @@ def process_data(uploaded_file, sens_level):
|
|
204 |
elif model_name == 'LANG':
|
205 |
df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
|
206 |
# df[model_name] = predict_category(df, model_name, progress_bar, repo='xlm-roberta-base-language-detection', profile='papluca')
|
|
|
|
|
207 |
|
208 |
logger.info(f"Completed: {model_name}")
|
209 |
model_progress.empty()
|
|
|
83 |
|
84 |
# Function to call model and run inference for varying classification tasks/models
|
85 |
def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
|
86 |
+
device = torch.device("cuda") if torch.cuda.is_available() else (torch.device("mps") if torch.backends.mps.is_built() else torch.device("cpu"))
|
87 |
model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
|
88 |
if model_name in model_names_sf:
|
89 |
col_name = re.sub(r'_(.*)', r'_txt', model_name)
|
|
|
91 |
model.to(device)
|
92 |
# Get tokenizer from the model
|
93 |
tokenizer = model.model_body.tokenizer
|
94 |
+
elif model_name == 'ADAPMIT_TECH_TEST':
|
95 |
+
col_name = 'tech_txt'
|
96 |
+
model = pipeline("text-classification",
|
97 |
+
model=profile+"/"+repo,
|
98 |
+
device=device,
|
99 |
+
return_all_scores=multilabel,
|
100 |
+
truncation=True,
|
101 |
+
max_length=512)
|
102 |
else:
|
103 |
col_name = 'scope_txt'
|
104 |
model = pipeline("text-classification",
|
|
|
106 |
device=device,
|
107 |
return_all_scores=multilabel,
|
108 |
truncation=True,
|
109 |
+
max_length=512)
|
110 |
predictions = []
|
111 |
total = len(df)
|
112 |
for i, text in enumerate(df[col_name]):
|
113 |
try:
|
114 |
if model_name in model_names_sf:
|
115 |
# Truncate text for SetFit models
|
116 |
+
encoded = tokenizer(text, truncation=True, max_length=512)
|
117 |
+
truncated_text = tokenizer.decode(encoded['input_ids'])
|
118 |
+
prediction = model(truncated_text)
|
119 |
+
predictions.append(0 if prediction == 'NEGATIVE' else 1)
|
120 |
else:
|
121 |
prediction = model(text)
|
122 |
+
if model_name == 'ADAPMIT' or model_name == 'ADAPMIT_TECH_TEST':
|
123 |
predictions.append(re.sub('Label$', '', prediction[0]['label']))
|
124 |
elif model_name == 'SECTOR':
|
125 |
predictions.append(extract_predicted_labels(prediction[0], threshold=0.5))
|
|
|
177 |
|
178 |
# Define models and predictions
|
179 |
model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
|
180 |
+
model_names = model_names_sf + ['ADAPMIT', 'SECTOR', 'LANG','ADAPMIT_TECH_TEST']
|
181 |
total_predictions = len(model_names) * len(df)
|
182 |
progress_count = 0
|
183 |
|
|
|
205 |
df[model_name] = predict_category(df, model_name, progress_bar, repo='classifier_SF_' + model_name, profile='mtyrrell')
|
206 |
elif model_name == 'ADAPMIT':
|
207 |
df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
|
208 |
+
|
209 |
elif model_name == 'SECTOR':
|
210 |
sectors_dict = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
|
211 |
df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
|
|
|
213 |
elif model_name == 'LANG':
|
214 |
df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
|
215 |
# df[model_name] = predict_category(df, model_name, progress_bar, repo='xlm-roberta-base-language-detection', profile='papluca')
|
216 |
+
elif model_name == 'ADAPMIT_TECH_TEST':
|
217 |
+
df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
|
218 |
|
219 |
logger.info(f"Completed: {model_name}")
|
220 |
model_progress.empty()
|