Spaces:

mtyrrell
/

maf_prefilter_app

Running

App Files Files Community

mtyrrell committed on Feb 4

Commit

6829fd5

0 Parent(s):

init new space

Browse files

Files changed (7) hide show

.gitignore +6 -0
README.md +12 -0
app.py +144 -0
images/pipeline.png +0 -0
modules/auth.py +13 -0
modules/utils.py +216 -0
requirements.txt +7 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,6 @@

+.env
+.DS_Store
+*.csv
+*.xlsx
+/testing/
+/modules/__pycache__/

README.md ADDED Viewed

	@@ -0,0 +1,12 @@

+---
+title: MAF Prefilter
+emoji: 🦀
+colorFrom: yellow
+colorTo: red
+sdk: streamlit
+sdk_version: 1.33.0
+app_file: app.py
+pinned: false
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import torch
# Startup diagnostic: report on stdout whether a CUDA device is visible.
# Every failure is caught and printed so the app still starts on CPU-only hosts.
try:
    cuda_ready = torch.cuda.is_available()
    print(f"Is CUDA available: {cuda_ready}")
    if not cuda_ready:
        print("No CUDA device available - using CPU")
    else:
        # Looking up the device name can fail independently of the availability probe.
        try:
            device_index = torch.cuda.current_device()
            print(f"CUDA device: {torch.cuda.get_device_name(device_index)}")
        except Exception as name_error:
            print(f"Error getting CUDA device name: {name_error}")
except Exception as probe_error:
    print(f"Error checking CUDA availability: {probe_error}")
    print("Continuing with CPU...")
+import streamlit as st
+import os
+from huggingface_hub import login
+from datetime import datetime
+from modules.auth import validate_login, check_password
+from modules.utils import create_excel, clean_text, extract_predicted_labels, predict_category, process_data
+# Local
+# from dotenv import load_dotenv
+# load_dotenv()
+# Main app logic
def main():
    """Render the MAF pre-filtering page and drive upload -> processing -> download.

    Reads the Hugging Face token from the ``HF_TOKEN`` environment variable and
    logs in to the Hub, draws the sidebar (instructions, template download,
    sensitivity selector) and, once a file is uploaded, passes it to
    ``process_data`` and offers the resulting DataFrame as a CSV download.

    Session state keys used:
        authenticated:  whether the user may see the app.
        hf_logged_in:   set once the Hub login has been performed for this session.
        data_processed: True when ``df`` holds results for the current upload.
        processed_file: (name, size) of the upload that ``df`` was computed from.
        df:             the processed DataFrame (or None).
    """
    # NOTE(review): authentication is switched on unconditionally here (the original
    # comment says this is temporary, for testing), so the commented-out login form
    # at the end of this function is never reached. Confirm before deploying publicly.
    if 'authenticated' not in st.session_state:
        st.session_state['authenticated'] = True
    if st.session_state['authenticated']:
        # A missing token previously surfaced as a raw KeyError traceback.
        hf_token = os.environ.get("HF_TOKEN")
        if not hf_token:
            st.error("The HF_TOKEN environment variable is not set; cannot authenticate with the Hugging Face Hub.")
            return
        # Streamlit re-executes this script on every widget interaction; log in only once per session.
        if not st.session_state.get('hf_logged_in', False):
            login(token=hf_token, add_to_git_credential=True)
            st.session_state['hf_logged_in'] = True
        # Initialize session state variables
        if 'data_processed' not in st.session_state:
            st.session_state['data_processed'] = False
            st.session_state['df'] = None
        # Main Streamlit app
        st.title('MAF Application Pre-Filtering Tool')
        # Sidebar (filters)
        with st.sidebar:
            with st.expander("ℹ️ - Instructions", expanded=False):
                st.markdown(
                    """
                    1. **Download the Excel Template file (below).**
                    2. **[OPTIONAL]: Select the desired filtering sensitivity level (below).**
                    3. **Copy/paste the requisite application data in the template file. Best practice is to 'paste as values'.**
                    4. **Upload the template file in the area to the right (or click browse files).**
                    The tool will immediately start processing the uploaded application data. This can take considerable time
                    depending on the number of applications and the length of text in each. For example, a file with 500 applications
                    could be expected to take approximately 20 minutes.
                    ***NOTE (1)** -  you can also simply rename the column headers in your own file. The headers must match the column names in the template for the tool to run properly.*
                    ***NOTE (2)** - as of April 2024 this app running as a **test version**, NOT on a GPU. So the process can take up to 30 minutes for 20 applications.*
                    """
                )
            # Excel file download
            st.download_button(
                label="Download Excel Template",
                data=create_excel(),
                file_name="MAF_upload_template.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
            # get sensitivity level for use in review / reject (ref. process_data function)
            sens_options = {
                "Low": 4,
                "Medium": 5,
                "High": 7,
            }
            sens_input = st.sidebar.radio(label = 'Select the Sensitivity Level [OPTIONAL]',
                                    help = 'Increasing the level of sensitivity results in more \
                                    applications being filtered out. At the same time, this also \
                                    increases the probability of false negatives (FNs). The rate of \
                                    FNs at the lowest setting is approximately 6 percent, and \
                                    approaches 13 percent at the highest setting. ',
                                    options = list(sens_options.keys()),
                                    horizontal = False)
            sens_level = sens_options[sens_input]
        with st.expander("ℹ️ - About this app", expanded=False):
            st.write(
                """
                This tool provides an interface for running an automated preliminary assessment of applications to the MAF call for applications.
                The tool functions by running selected text fields from the application through a series of 8 LLMs fine-tuned for text classification (ref. diagram below).
                The resulting output classifications are used to compute a score and a suggested pre-filtering action. The tool has been tested against
                human assessors and exhibits an extremely low false negative rate (<6%) at a Sensitivity Level of 'Low' (i.e. rejection threshold for predicted score < 4).
                """)
            st.image('images/pipeline.png')
        uploaded_file = st.file_uploader("Select a file containing MAF application pre-filtering data (see instructions in the sidebar)")
        if uploaded_file is not None:
            # Identify the upload so that a *different* file triggers reprocessing
            # instead of silently re-serving the previous file's results.
            file_key = (uploaded_file.name, uploaded_file.size)
            if st.session_state.get('processed_file') != file_key:
                st.session_state['data_processed'] = False
            try:
                if not st.session_state['data_processed']:
                    st.session_state['df'] = process_data(uploaded_file, sens_level)
                    st.session_state['data_processed'] = True
                    st.session_state['processed_file'] = file_key
                df = st.session_state['df']
                # Timestamp the download name so repeated exports do not overwrite each other
                current_datetime = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
                output_filename = 'processed_applications_'+current_datetime+'.csv'
                # Serialise in memory: avoids an unclosed file handle and a fixed
                # on-disk path that concurrent sessions would overwrite.
                csv_bytes = df.to_csv(index=False).encode('utf-8')
                st.download_button(
                    label="Download data as CSV",
                    data=csv_bytes,
                    file_name=output_filename,
                    mime='text/csv',
                )
            except Exception as exc:
                # Keep the friendly message for the user, but leave a trace in the server log.
                print(f"Error while processing uploaded file: {exc!r}")
                st.error("Failed to process the file. Please ensure your column names match the template file.")
    # Comment out or remove the else block containing login form
    # else:
    #     username = st.text_input("Username")
    #     password = st.text_input("Password", type="password")
    #     if st.button("Login"):
    #         if validate_login(username, password):
    #             st.session_state['authenticated'] = True
    #             st.experimental_rerun()
    #         else:
    #             st.error("Incorrect username or password")
# Run the main function
main()

images/pipeline.png ADDED Viewed

modules/auth.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import os
+import bcrypt
+# Helper functions
def check_password(provided_password, stored_hash):
    """Return True if ``provided_password`` matches the bcrypt hash ``stored_hash``.

    Args:
        provided_password: the plaintext password as a ``str``.
        stored_hash: the bcrypt hash, as ``bytes`` or ``str``.

    Returns:
        True on a match; False on a mismatch or when ``stored_hash`` is not a
        well-formed bcrypt hash.
    """
    if isinstance(stored_hash, str):
        stored_hash = stored_hash.encode()
    try:
        return bcrypt.checkpw(provided_password.encode(), stored_hash)
    except ValueError:
        # bcrypt rejects malformed hashes with ValueError ("Invalid salt");
        # treat a corrupt stored hash as a failed login rather than crashing.
        return False
def validate_login(username, password):
    """Check ``username``/``password`` against a bcrypt hash stored in the environment.

    The hash is read from the environment variable ``<USERNAME>_HASH``
    (username upper-cased), e.g. ``USER1_HASH`` for user ``user1``.

    Returns:
        True only when the variable exists and the password matches its hash;
        False for unknown users, empty or non-string credentials, or a mismatch.
    """
    # Reject empty / non-string input up front: avoids AttributeError on None
    # and avoids looking up a bare "_HASH" variable for an empty username.
    if not isinstance(username, str) or not isinstance(password, str) or not username:
        return False
    # Retrieve user's hashed password from environment variables
    user_hash = os.getenv(username.upper() + '_HASH')
    if not user_hash:
        return False
    return check_password(password, user_hash.encode())

modules/utils.py ADDED Viewed

	@@ -0,0 +1,216 @@

+import re
+import time
+import pandas as pd
+from io import BytesIO
+import streamlit as st
+import torch
+from setfit import SetFitModel
+from transformers import pipeline
+from openpyxl import Workbook
+from openpyxl.styles import Font, NamedStyle, PatternFill
+from openpyxl.styles.differential import DifferentialStyle
+# Function for creating Upload template file
def create_excel():
    """Build the blank upload template and return it as the bytes of an .xlsx file.

    The workbook has a single sheet named "template" whose first row holds the
    expected column headers, rendered bold on a light-blue fill.
    """
    header_names = ['id', 'scope', 'technology', 'financial', 'barrier']

    workbook = Workbook()
    template_sheet = workbook.active
    template_sheet.title = "template"
    template_sheet.append(header_names)  # header goes into row 1

    # Style the header: only the first row of the selected range is touched.
    header_fill = PatternFill('solid', fgColor='bad8e1')
    header_font = Font(bold=True)
    first_row = template_sheet['A1:E4'][0]
    for header_cell in first_row:
        header_cell.fill = header_fill
        header_cell.font = header_font

    # Serialise to memory instead of disk so the bytes can be handed to a download widget.
    buffer = BytesIO()
    workbook.save(buffer)
    return buffer.getvalue()
+# Function to clean text
def clean_text(input_text):
    """Normalise a text cell before it is passed to the classifiers.

    Steps:
      1. Drop every character that is not an ASCII letter, digit, whitespace
         or one of ``. , : ; ! ? ( ) -``.
      2. Remove the literal ``x000D`` (what remains of Excel's ``_x000D_``
         carriage-return escape once the underscores are stripped in step 1).
      3. Collapse every run of whitespace, newlines included, to one space.

    Args:
        input_text: the cell value. ``None`` and NaN give ``""``; any other
            non-string value (e.g. a numeric cell) is converted with ``str()``.

    Returns:
        The cleaned string.
    """
    if input_text is None:
        return ""
    if isinstance(input_text, float) and input_text != input_text:
        # NaN is the only float that is not equal to itself.
        return ""
    if not isinstance(input_text, str):
        input_text = str(input_text)
    cleaned_text = re.sub(r"[^a-zA-Z0-9\s.,:;!?()\-\n]", "", input_text)
    cleaned_text = cleaned_text.replace("x000D", "")
    # Newlines are whitespace too, so no separate newline pass is needed afterwards.
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)
    return cleaned_text
+# # Function for extracting classifications for each SECTOR label
def extract_predicted_labels(output, ordinal_selection=1, threshold=0.5):
    """Pick the two highest-scoring labels above ``threshold`` from a classifier output.

    Args:
        output: list of dicts, each expected to carry ``'label'`` and ``'score'`` keys.
        ordinal_selection: unused; kept so existing callers keep working.
        threshold: only items whose score is strictly greater than this are considered.

    Returns:
        ``{"SECTOR1": <top label or None>, "SECTOR2": <second label or None>}``.
        Both are None when ``output`` is not a list of dicts or nothing passes
        the threshold.
    """
    if not (isinstance(output, list) and all(isinstance(item, dict) for item in output)):
        print("Error: Inner data is not formatted correctly. Each item must be a dictionary.")
        return {"SECTOR1": None, "SECTOR2": None}

    def _score(item):
        # A missing or non-numeric score counts as 0 so it can never pass the threshold
        # (and cannot raise a TypeError in the comparison or the sort).
        value = item.get('score', 0)
        return value if isinstance(value, (int, float)) else 0

    # sorted() is stable, so ties keep their original relative order.
    ranked = sorted(
        (item for item in output if _score(item) > threshold),
        key=_score,
        reverse=True,
    )
    if not ranked:
        print("Warning: No items above the threshold in the current list.")

    top_labels = [item.get('label') for item in ranked[:2]]
    top_labels += [None] * (2 - len(top_labels))  # pad to exactly two entries
    return {"SECTOR1": top_labels[0], "SECTOR2": top_labels[1]}
+# Function to call model and run inference for varying classification tasks/models
def predict_category(df, model_name, progress_bar, repo, profile, multilabel=False):
    """Run one classification model over every row of ``df`` and return its predictions.

    Args:
        df: DataFrame holding the text columns to classify.
        model_name: one of the SetFit model names listed in ``model_names_sf``
            below, or ``'ADAPMIT'``, ``'SECTOR'`` or ``'LANG'``.
        progress_bar: object exposing ``progress(fraction)``; updated after each row.
        repo: model repository name; the model id loaded is ``profile + "/" + repo``.
        profile: owner (user/organisation) of the model repository.
        multilabel: forwarded to the transformers pipeline as ``return_all_scores``.

    Returns:
        A list with one entry per row of ``df``:
          * SetFit models: 0 when the prediction equals ``'NEGATIVE'``, else 1;
          * ``'ADAPMIT'``: the top label with a trailing ``"Label"`` removed;
          * ``'SECTOR'``: the dict returned by ``extract_predicted_labels``;
          * ``'LANG'``: the top label.

    Raises:
        ValueError: if ``model_name`` is not one of the names above. Checked before
            any model is loaded so a typo does not cost a full inference run.
    """
    # 'bar_lab2' used to be part of this list and is currently disabled.
    model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
    pipeline_model_names = ('ADAPMIT', 'SECTOR', 'LANG')
    is_setfit = model_name in model_names_sf
    if not is_setfit and model_name not in pipeline_model_names:
        raise ValueError(f"Unknown model name: {model_name!r}")

    # Prefer CUDA, then Apple MPS, then CPU. torch.backends.mps.is_available() is the
    # runtime check; torch.has_mps is a deprecated build-time flag.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        mps_backend = getattr(torch.backends, "mps", None)  # absent on older torch builds
        mps_ready = mps_backend is not None and mps_backend.is_available()
        device = torch.device("mps" if mps_ready else "cpu")

    if is_setfit:
        # e.g. 'scope_lab1' -> 'scope_txt': each SetFit model reads its own text column
        col_name = re.sub(r'_(.*)', r'_txt', model_name)
        model = SetFitModel.from_pretrained(profile+"/"+repo)
        model.to(device)
    else:
        # The pipeline-based models all classify the scope text.
        col_name = 'scope_txt'
        model = pipeline("text-classification", model=profile+"/"+repo, device=device, return_all_scores=multilabel)

    predictions = []
    total = len(df)
    for i, text in enumerate(df[col_name]):
        # NOTE(review): the text is passed without any truncation argument; confirm
        # that inputs longer than the model's maximum length are handled.
        prediction = model(text)
        if is_setfit:
            # NOTE(review): relies on the SetFit model returning the string 'NEGATIVE'
            # for the negative class; any other return value is recorded as 1.
            predictions.append(0 if prediction == 'NEGATIVE' else 1)
        elif model_name == 'ADAPMIT':
            predictions.append(re.sub('Label$', '', prediction[0]['label']))
        elif model_name == 'SECTOR':
            predictions.append(extract_predicted_labels(prediction[0], threshold=0.5))
        elif model_name == 'LANG':
            predictions.append(prediction[0]['label'])
        # Update progress bar with each iteration
        progress_bar.progress((i + 1) / total)
    return predictions
+# Main function to process data
def _score_applications(df, sens_level):
    """Add leverage metrics, ``pred_score`` and ``pred_action`` columns to ``df`` and return it."""
    funding_cols = ['maf_funding', 'cont_public', 'cont_private', 'cont_other']
    # Convert funding columns to numeric; anything non-numeric (or blank) becomes 0
    for col in funding_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df[funding_cols] = df[funding_cols].fillna(0)

    df['lev_total'] = df['cont_public'] + df['cont_private'] + df['cont_other']
    # Kept as the strings 'True'/'False' so the exported CSV is unchanged
    df['lev_gt_maf'] = (df['lev_total'] > df['maf_funding']).map({True: 'True', False: 'False'})
    df['lev_gt_0'] = (df['lev_total'] > 0).astype(int)
    # Leverage as a percentage of MAF funding (0 when no MAF funding was requested)
    df['lev_maf_%'] = df.apply(lambda x: round(x['lev_total']/x['maf_funding']*100,2) if x['maf_funding'] != 0 else 0, axis=1)
    # Normalised leverage scale (0-1) where 300% leverage = 1
    df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)

    # Weighted score out of 10: fin_lab2, scope_lab1 and scope_lab2 count double.
    df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0'])/9*10,0), axis=1)

    sector_classes = ['Energy','Transport','Industries']
    in_target_sector = df['SECTOR1'].isin(sector_classes) | df['SECTOR2'].isin(sector_classes)
    # The rejection threshold is the caller-selected sensitivity level (it was hard-coded to 4).
    reject = (
        (df['pred_score'] < sens_level)
        | (df['LANG'] != 'en-US')
        | (df['ADAPMIT'] == 'Adaptation')
        | ~in_target_sector
    )
    df['pred_action'] = reject.map({True: 'REJECT', False: 'REVIEW'})
    return df


# Main function to process data
def process_data(uploaded_file, sens_level):
    """Classify every application in an uploaded Excel file and score it.

    Reads the workbook, cleans the text columns, runs each classifier through
    ``predict_category`` while showing progress in the Streamlit page, then
    derives the leverage metrics, ``pred_score`` and ``pred_action``.

    Args:
        uploaded_file: anything ``pandas.read_excel`` accepts. Must contain the
            ``scope``, ``technology`` and ``financial`` columns. ``id`` and the four
            funding columns (``maf_funding_requested``, ``contributions_public_sector``,
            ``contributions_private_sector``, ``contributions_other``) are optional;
            missing funding columns are treated as 0.
        sens_level: applications whose ``pred_score`` is below this value are
            marked ``REJECT``.

    Returns:
        The DataFrame with one column per model plus the derived columns.

    Raises:
        ValueError: if a required text column is missing or the sheet has no rows.
            Raised before any model is loaded.
    """
    df = pd.read_excel(uploaded_file)
    # Column renaming and initial processing
    df.rename(columns={
        'id': 'id',
        'scope': 'scope_txt',
        'technology': 'tech_txt',
        'financial': 'fin_txt',
        'barrier': 'bar_txt',
        'maf_funding_requested':'maf_funding',
        'contributions_public_sector':'cont_public',
        'contributions_private_sector':'cont_private',
        'contributions_other':'cont_other'}, inplace=True)

    text_cols = ['scope_txt', 'tech_txt', 'fin_txt']
    funding_cols = ['maf_funding', 'cont_public', 'cont_private', 'cont_other']

    # Validate up front so a bad upload fails in seconds, not after all inference has run.
    missing_cols = [col for col in text_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Uploaded file is missing required column(s): {', '.join(missing_cols)}")
    if df.empty:
        raise ValueError("Uploaded file contains no application rows.")

    # Keep only the columns used below. Funding columns are always kept; reindex
    # creates any that are absent as NaN, which becomes 0 during scoring.
    keep_cols = [col for col in ['id'] + text_cols if col in df.columns] + funding_cols
    df = df.reindex(columns=keep_cols)
    for col in text_cols:
        # astype(str) guards against numeric cells, which the regex cleaning cannot handle
        df[col] = df[col].fillna('').astype(str).map(clean_text)

    # Define models: name -> (profile, repo, multilabel)
    model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2']
    model_sources = {name: ('mtyrrell', 'classifier_SF_' + name, False) for name in model_names_sf}
    model_sources['ADAPMIT'] = ('GIZ', 'ADAPMIT-multilabel-bge_f', False)
    model_sources['SECTOR'] = ('GIZ', 'SECTOR-multilabel-bge_f', True)
    model_sources['LANG'] = ('qanastek', '51-languages-classifier', False)
    total_steps = len(model_sources)

    # UI setup for progress tracking
    st.subheader("Overall Progress:")
    patience_text = st.empty()
    patience_text.markdown("*You may want to grab a coffee, this can take a while...*")
    overall_progress = st.progress(0)
    overall_start_time = time.time()
    estimated_time_remaining_text = st.empty()

    # Model processing
    for step_count, (model_name, (profile, repo, multilabel)) in enumerate(model_sources.items(), start=1):
        model_processing_text = st.empty()
        model_processing_text.markdown(f'**Current Task: Processing with model "{model_name}"**')
        model_progress = st.empty()
        progress_bar = model_progress.progress(0)
        # Load the model and run inference
        predictions = predict_category(df, model_name, progress_bar, repo=repo, profile=profile, multilabel=multilabel)
        if model_name == 'SECTOR':
            # SECTOR yields a dict per row; split it into two columns
            df['SECTOR1'] = [item['SECTOR1'] for item in predictions]
            df['SECTOR2'] = [item['SECTOR2'] for item in predictions]
        else:
            df[model_name] = predictions
        model_progress.empty()
        # Every model handles the same number of rows, so overall progress is steps done / total
        overall_progress.progress(step_count / total_steps)
        # Calculate and display estimated time remaining
        elapsed_time = time.time() - overall_start_time
        steps_remaining = total_steps - step_count
        next_step = min(step_count + 1, total_steps)
        if step_count > 1:
            estimated_time_remaining = (elapsed_time / step_count) * steps_remaining
            estimated_time_remaining_text.write(f'Estimated Time Remaining: {estimated_time_remaining:.0f} seconds (step {next_step} of {total_steps})')
        else:
            estimated_time_remaining_text.write(f'Calculating time remaining... (step {next_step} of {total_steps})')
        model_processing_text.empty()

    patience_text.empty()
    estimated_time_remaining_text.empty()
    elapsed_time = time.time() - overall_start_time
    st.write(f'Processing complete. Total time: {elapsed_time:.1f} seconds')

    # Further data processing and actions
    return _score_applications(df, sens_level)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+streamlit
+pandas
+openpyxl
+setfit
+bcrypt
+--extra-index-url https://download.pytorch.org/whl/cu113
+torch