Spaces:

mtyrrell
/

maf_prefilter_app

Running

App Files Community

mtyrrell committed on Feb 14

Commit

07660e6

1 Parent(s): bcd4037

test ADAPTMIT_TEXT

Browse files

Files changed (2) hide show

app.py +12 -12
modules/utils.py +19 -8

app.py CHANGED Viewed

@@ -26,15 +26,15 @@ from io import BytesIO
 logger = logging.getLogger(__name__)
 # Local
-# from dotenv import load_dotenv
-# load_dotenv()
 # Main app logic
 def main():
     # Temporarily set authentication to True for testing
     if 'authenticated' not in st.session_state:
-        st.session_state['authenticated'] = False
     if st.session_state['authenticated']:
         # Remove login success message for testing
@@ -183,15 +183,15 @@ def main():
     # Comment out for testing
-    else:
-        username = st.text_input("Username")
-        password = st.text_input("Password", type="password")
-        if st.button("Login"):
-            if validate_login(username, password):
-                st.session_state['authenticated'] = True
-                st.rerun()
-            else:
-                st.error("Incorrect username or password")

 logger = logging.getLogger(__name__)
 # Local
+from dotenv import load_dotenv
+load_dotenv()
 # Main app logic
 def main():
     # Temporarily set authentication to True for testing
     if 'authenticated' not in st.session_state:
+        st.session_state['authenticated'] = True
     if st.session_state['authenticated']:
         # Remove login success message for testing
     # Comment out for testing
+    # else:
+    #     username = st.text_input("Username")
+    #     password = st.text_input("Password", type="password")
+    #     if st.button("Login"):
+    #         if validate_login(username, password):
+    #             st.session_state['authenticated'] = True
+    #             st.rerun()
+    #         else:
+    #             st.error("Incorrect username or password")

modules/utils.py CHANGED Viewed

@@ -83,7 +83,7 @@ def extract_predicted_labels(output, ordinal_selection=1, threshold=0.5):
 # Function to call model and run inference for varying classification tasks/models
 def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
-    device = torch.device("cuda") if torch.cuda.is_available() else (torch.device("mps") if torch.has_mps else torch.device("cpu"))
     model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
     if model_name in model_names_sf:
         col_name = re.sub(r'_(.*)', r'_txt', model_name)
@@ -91,6 +91,14 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
         model.to(device)
         # Get tokenizer from the model
         tokenizer = model.model_body.tokenizer
     else:
         col_name = 'scope_txt'
         model = pipeline("text-classification",
@@ -98,20 +106,20 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
                         device=device,
                         return_all_scores=multilabel,
                         truncation=True,
-                        max_length=512)
     predictions = []
     total = len(df)
     for i, text in enumerate(df[col_name]):
         try:
             if model_name in model_names_sf:
                 # Truncate text for SetFit models
-                encoded = tokenizer(text, truncation=True, max_length=512)
-                truncated_text = tokenizer.decode(encoded['input_ids'])
-                prediction = model(truncated_text)
-                predictions.append(0 if prediction == 'NEGATIVE' else 1)
             else:
                 prediction = model(text)
-                if model_name == 'ADAPMIT':
                     predictions.append(re.sub('Label$', '', prediction[0]['label']))
                 elif model_name == 'SECTOR':
                     predictions.append(extract_predicted_labels(prediction[0], threshold=0.5))
@@ -169,7 +177,7 @@ def process_data(uploaded_file, sens_level):
     # Define models and predictions
     model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
-    model_names = model_names_sf + ['ADAPMIT', 'SECTOR', 'LANG']
     total_predictions = len(model_names) * len(df)
     progress_count = 0
@@ -197,6 +205,7 @@ def process_data(uploaded_file, sens_level):
             df[model_name] = predict_category(df, model_name, progress_bar, repo='classifier_SF_' + model_name, profile='mtyrrell')
         elif model_name == 'ADAPMIT':
             df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
         elif model_name == 'SECTOR':
             sectors_dict = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
             df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
@@ -204,6 +213,8 @@ def process_data(uploaded_file, sens_level):
         elif model_name == 'LANG':
             df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
             # df[model_name] = predict_category(df, model_name, progress_bar, repo='xlm-roberta-base-language-detection', profile='papluca')
         logger.info(f"Completed: {model_name}")
         model_progress.empty()

 # Function to call model and run inference for varying classification tasks/models
 def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
+    device = torch.device("cuda") if torch.cuda.is_available() else (torch.device("mps") if torch.backends.mps.is_built() else torch.device("cpu"))
     model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
     if model_name in model_names_sf:
         col_name = re.sub(r'_(.*)', r'_txt', model_name)
         model.to(device)
         # Get tokenizer from the model
         tokenizer = model.model_body.tokenizer
+    elif model_name == 'ADAPMIT_TECH_TEST':
+        col_name = 'tech_txt'
+        model = pipeline("text-classification",
+                model=profile+"/"+repo,
+                device=device,
+                return_all_scores=multilabel,
+                truncation=True,
+                max_length=512)
     else:
         col_name = 'scope_txt'
         model = pipeline("text-classification",
                         device=device,
                         return_all_scores=multilabel,
                         truncation=True,
+                        max_length=512)
     predictions = []
     total = len(df)
     for i, text in enumerate(df[col_name]):
         try:
             if model_name in model_names_sf:
                 # Truncate text for SetFit models
+                    encoded = tokenizer(text, truncation=True, max_length=512)
+                    truncated_text = tokenizer.decode(encoded['input_ids'])
+                    prediction = model(truncated_text)
+                    predictions.append(0 if prediction == 'NEGATIVE' else 1)
             else:
                 prediction = model(text)
+                if model_name == 'ADAPMIT' or model_name == 'ADAPMIT_TECH_TEST':
                     predictions.append(re.sub('Label$', '', prediction[0]['label']))
                 elif model_name == 'SECTOR':
                     predictions.append(extract_predicted_labels(prediction[0], threshold=0.5))
     # Define models and predictions
     model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
+    model_names = model_names_sf + ['ADAPMIT', 'SECTOR', 'LANG','ADAPMIT_TECH_TEST']
     total_predictions = len(model_names) * len(df)
     progress_count = 0
             df[model_name] = predict_category(df, model_name, progress_bar, repo='classifier_SF_' + model_name, profile='mtyrrell')
         elif model_name == 'ADAPMIT':
             df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
         elif model_name == 'SECTOR':
             sectors_dict = predict_category(df, model_name, progress_bar, repo='SECTOR-multilabel-bge_f', profile='GIZ', multilabel=True)
             df['SECTOR1'] = [item['SECTOR1'] for item in sectors_dict]
         elif model_name == 'LANG':
             df[model_name] = predict_category(df, model_name, progress_bar, repo='51-languages-classifier', profile='qanastek')
             # df[model_name] = predict_category(df, model_name, progress_bar, repo='xlm-roberta-base-language-detection', profile='papluca')
+        elif model_name == 'ADAPMIT_TECH_TEST':
+            df[model_name] = predict_category(df, model_name, progress_bar, repo='ADAPMIT-multilabel-bge_f', profile='GIZ')
         logger.info(f"Completed: {model_name}")
         model_progress.empty()