Spaces:

mtyrrell
/

maf_prefilter_app

Running

App Files Community

mtyrrell committed on Feb 6

Commit

bf83e2c

1 Parent(s): ab78519

organization count

Browse files

Files changed (3) hide show

app.py +6 -7
modules/org_count.py +9 -1
modules/utils.py +33 -11

app.py CHANGED Viewed

@@ -12,18 +12,17 @@ except Exception as e:
     print(f"Error checking CUDA availability: {str(e)}")
     print("Continuing with CPU...")
-from modules.logging_config import setup_logging
-setup_logging()
-import logging
-logger = logging.getLogger(__name__)
 import streamlit as st
 import os
 from huggingface_hub import login
 from datetime import datetime
 from modules.auth import validate_login
 from modules.utils import create_excel, clean_text, extract_predicted_labels, predict_category, process_data
 # Local
 # from dotenv import load_dotenv
@@ -64,7 +63,7 @@ def main():
                     depending on the number of applications and the length of text in each. For example, a file with 1000 applications
                     could be expected to take approximately 5 minutes.
-                    ***NOTE (1)** -  you can also simply rename the column headers in your own file. The headers must match the column names in the template for the tool to run properly.*
                     """
                 )

     print(f"Error checking CUDA availability: {str(e)}")
     print("Continuing with CPU...")
 import streamlit as st
 import os
 from huggingface_hub import login
 from datetime import datetime
 from modules.auth import validate_login
 from modules.utils import create_excel, clean_text, extract_predicted_labels, predict_category, process_data
+from modules.logging_config import setup_logging
+setup_logging()
+import logging
+logger = logging.getLogger(__name__)
 # Local
 # from dotenv import load_dotenv
                     depending on the number of applications and the length of text in each. For example, a file with 1000 applications
                     could be expected to take approximately 5 minutes.
+                    ***NOTE** -  you can also simply rename the column headers in your own file. The headers must match the column names in the template for the tool to run properly.*
                     """
                 )

modules/org_count.py CHANGED Viewed

@@ -1,5 +1,8 @@
 import pandas as pd
 from thefuzz import fuzz
 def standardize_organization_names(df):
@@ -14,7 +17,7 @@ def standardize_organization_names(df):
     """
     # Make a copy to avoid modifying the original DataFrame
     df = df.copy()
     # Dictionary of organization variations and their standardized names
     org_variations = {
         'Adventist Development Relief Agency': ['adventist development'],
@@ -116,6 +119,11 @@ def standardize_organization_names(df):
     # Add concept count
     df['concept_count'] = df.groupby('org_renamed').cumcount() + 1
     return df

 import pandas as pd
 from thefuzz import fuzz
+import logging
+logger = logging.getLogger(__name__)
 def standardize_organization_names(df):
     """
     # Make a copy to avoid modifying the original DataFrame
     df = df.copy()
+    logger.info(f"Checking org names")
     # Dictionary of organization variations and their standardized names
     org_variations = {
         'Adventist Development Relief Agency': ['adventist development'],
     # Add concept count
     df['concept_count'] = df.groupby('org_renamed').cumcount() + 1
+    # Reorder columns with id, organization, org_renamed, concept_count first, followed by all others
+    cols = ['id', 'organization', 'org_renamed', 'concept_count']
+    other_cols = [col for col in df.columns if col not in cols]
+    df = df[cols + other_cols]
     return df

modules/utils.py CHANGED Viewed

@@ -9,6 +9,7 @@ from transformers import pipeline
 from openpyxl import Workbook
 from openpyxl.styles import Font, NamedStyle, PatternFill
 from openpyxl.styles.differential import DifferentialStyle
 import logging
 logger = logging.getLogger(__name__)
@@ -127,20 +128,40 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
 # Main function to process data
 def process_data(uploaded_file, sens_level):
-    df = pd.read_excel(uploaded_file)
-    logger.info(f"Data import successful")
-    # Rename columns
-    df.rename(columns={
         'id': 'id',
         'scope': 'scope_txt',
         'technology': 'tech_txt',
         'financial': 'fin_txt',
-        'maf_funding_requested':'maf_funding',
-        'contributions_public_sector':'cont_public',
-        'contributions_private_sector':'cont_private',
-        'contributions_other':'cont_other'}, inplace=True)
-    # clean the text fields
-    df = df.filter(['id', 'scope_txt', 'tech_txt', 'fin_txt','maf_funding','cont_public','cont_private','cont_other'])
     df.fillna('', inplace=True)
     df[['scope_txt', 'tech_txt', 'fin_txt']] = df[['scope_txt', 'tech_txt', 'fin_txt']].applymap(clean_text)
@@ -234,7 +255,8 @@ def process_data(uploaded_file, sens_level):
     df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
     # df['pred_action'] = df.apply(lambda x: 'REJECT' if (x['pred_score'] <4 or x['LANG'] != 'en-US' or x['ADAPMIT'] == 'Adaptation' or not ((x['SECTOR1'] in sector_classes) or (x['SECTOR2'] in sector_classes))) else 'REVIEW', axis=1)
     df['pred_action'] = df.apply(lambda x:
-        'INELIGIBLE' if (x['LANG'] != 'en-US' or
                         x['ADAPMIT'] == 'Adaptation' or
                         not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes))
         else 'REJECT' if x['pred_score'] <= sens_level

 from openpyxl import Workbook
 from openpyxl.styles import Font, NamedStyle, PatternFill
 from openpyxl.styles.differential import DifferentialStyle
+from modules.org_count import standardize_organization_names
 import logging
 logger = logging.getLogger(__name__)
 # Main function to process data
 def process_data(uploaded_file, sens_level):
+    # Define required columns and their mappings
+    required_columns = {
         'id': 'id',
         'scope': 'scope_txt',
         'technology': 'tech_txt',
         'financial': 'fin_txt',
+        'maf_funding_requested': 'maf_funding',
+        'contributions_public_sector': 'cont_public',
+        'contributions_private_sector': 'cont_private',
+        'contributions_other': 'cont_other'
+    }
+    # Read the Excel file
+    try:
+        df = pd.read_excel(uploaded_file)
+        df = standardize_organization_names(df)
+        logger.info("Data import successful")
+    except Exception as e:
+        logger.error(f"Failed to read Excel file: {str(e)}")
+        st.error("Failed to read the uploaded file. Please ensure it's a valid Excel file.")
+        return None
+    # Validate required columns
+    missing_columns = [col for col in required_columns.keys() if col not in df.columns]
+    if missing_columns:
+        error_msg = f"Missing required columns: {', '.join(missing_columns)}"
+        logger.error(error_msg)
+        st.error(error_msg)
+        return None
+    # Rename required columns while preserving all others
+    df = df.rename(columns={k: v for k, v in required_columns.items() if k in df.columns})
+    # Clean and process text fields
     df.fillna('', inplace=True)
     df[['scope_txt', 'tech_txt', 'fin_txt']] = df[['scope_txt', 'tech_txt', 'fin_txt']].applymap(clean_text)
     df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
     # df['pred_action'] = df.apply(lambda x: 'REJECT' if (x['pred_score'] <4 or x['LANG'] != 'en-US' or x['ADAPMIT'] == 'Adaptation' or not ((x['SECTOR1'] in sector_classes) or (x['SECTOR2'] in sector_classes))) else 'REVIEW', axis=1)
     df['pred_action'] = df.apply(lambda x:
+        'INELIGIBLE' if (x['concept_count'] > 6 or
+                        x['LANG'] != 'en-US' or
                         x['ADAPMIT'] == 'Adaptation' or
                         not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes))
         else 'REJECT' if x['pred_score'] <= sens_level