Spaces:

mtyrrell
/

maf_prefilter_app

Running

App Files Files Community

mtyrrell committed on Feb 7

Commit

c572984

1 Parent(s): 88e08d0

word length logic; improved lang classifier

Browse files

Files changed (3) hide show

app.py +24 -13
modules/org_count.py +4 -2
modules/utils.py +12 -13

app.py CHANGED Viewed

@@ -26,15 +26,15 @@ from io import BytesIO
 logger = logging.getLogger(__name__)
 # Local
-from dotenv import load_dotenv
-load_dotenv()
 # Main app logic
 def main():
     # Temporarily set authentication to True for testing
     if 'authenticated' not in st.session_state:
-        st.session_state['authenticated'] = True
     if st.session_state['authenticated']:
         # Remove login success message for testing
@@ -136,9 +136,20 @@ def main():
                         st.session_state['df'] = process_data(uploaded_file, sens_level)
                         logger.info("Data processing completed successfully")
                         st.session_state['data_processed'] = True
                     except Exception as e:
                         logger.error(f"Error in process_data: {str(e)}")
-                        raise
                 df = st.session_state['df']
@@ -172,15 +183,15 @@ def main():
     # Comment out for testing
-    # else:
-    #     username = st.text_input("Username")
-    #     password = st.text_input("Password", type="password")
-    #     if st.button("Login"):
-    #         if validate_login(username, password):
-    #             st.session_state['authenticated'] = True
-    #             st.rerun()
-    #         else:
-    #             st.error("Incorrect username or password")

 logger = logging.getLogger(__name__)
 # Local
+# from dotenv import load_dotenv
+# load_dotenv()
 # Main app logic
 def main():
     # Temporarily set authentication to True for testing
     if 'authenticated' not in st.session_state:
+        st.session_state['authenticated'] = False
     if st.session_state['authenticated']:
         # Remove login success message for testing
                         st.session_state['df'] = process_data(uploaded_file, sens_level)
                         logger.info("Data processing completed successfully")
                         st.session_state['data_processed'] = True
+                    except ValueError as e:
+                        # Handle specific validation errors
+                        logger.error(f"Validation error: {str(e)}")
+                        st.error(str(e))
+                        st.session_state['show_button'] = True
+                        st.session_state['processing'] = False
+                        st.rerun()
                     except Exception as e:
+                        # Handle other unexpected errors
                         logger.error(f"Error in process_data: {str(e)}")
+                        st.error("An unexpected error occurred. Please check your input file and try again.")
+                        st.session_state['show_button'] = True
+                        st.session_state['processing'] = False
+                        st.rerun()
                 df = st.session_state['df']
     # Comment out for testing
+    else:
+        username = st.text_input("Username")
+        password = st.text_input("Password", type="password")
+        if st.button("Login"):
+            if validate_login(username, password):
+                st.session_state['authenticated'] = True
+                st.rerun()
+            else:
+                st.error("Incorrect username or password")

modules/org_count.py CHANGED Viewed

@@ -17,6 +17,9 @@ def standardize_organization_names(df):
     """
     # Make a copy to avoid modifying the original DataFrame
     df = df.copy()
     # Return DataFrame as-is if 'organization' column is not present
     if 'organization' not in df.columns:
@@ -55,7 +58,7 @@ def standardize_organization_names(df):
             'Serviço Nacional de Aprendizagem Industrial (SENAI)': ['serviço nacional de aprendizagem'],
             'Sumy City Council': ['sumy city council'],
             'Uganda Development Bank Limited (UDBL)': ['uganda development bank'],
-            'UN-Habitat': ['united nations human settlement','un-habitat'],
             'United Nations Children\'s Fund (UNICEF)': ['united nations children'],
             'United Nations Conference on Trade and Development (UNCTAD)': ['united nations conference on trade'],
             'United Nations Development Programme (UNDP)': ['united nations development program'],
@@ -81,7 +84,6 @@ def standardize_organization_names(df):
             'Development Initiative for Community Impact (DICI)': ['DICI'],
             'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['EACREEE'],
             'Global Green Growth Institute (GGGI)': ['GGGI'],
-            'UN-Habitat': ['UN-Habitat'],
             'United Nations Children\'s Fund (UNICEF)': ['UNICEF'],
             'United Nations Conference on Trade and Development (UNCTAD)': ['UNCTAD'],
             'United Nations Development Programme (UNDP)': ['UNDP'],

     """
     # Make a copy to avoid modifying the original DataFrame
     df = df.copy()
+    # Sort DataFrame by 'id' column in ascending order
+    df = df.sort_values('id', ascending=True)
     # Return DataFrame as-is if 'organization' column is not present
     if 'organization' not in df.columns:
             'Serviço Nacional de Aprendizagem Industrial (SENAI)': ['serviço nacional de aprendizagem'],
             'Sumy City Council': ['sumy city council'],
             'Uganda Development Bank Limited (UDBL)': ['uganda development bank'],
+            'United Nations Human Settlement Programme (UN-Habitat)': ['united nations human settlement','un-habitat'],
             'United Nations Children\'s Fund (UNICEF)': ['united nations children'],
             'United Nations Conference on Trade and Development (UNCTAD)': ['united nations conference on trade'],
             'United Nations Development Programme (UNDP)': ['united nations development program'],
             'Development Initiative for Community Impact (DICI)': ['DICI'],
             'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['EACREEE'],
             'Global Green Growth Institute (GGGI)': ['GGGI'],
             'United Nations Children\'s Fund (UNICEF)': ['UNICEF'],
             'United Nations Conference on Trade and Development (UNCTAD)': ['UNCTAD'],
             'United Nations Development Programme (UNDP)': ['UNDP'],

modules/utils.py CHANGED Viewed

@@ -143,12 +143,14 @@ def process_data(uploaded_file, sens_level):
     # Read the Excel file
     try:
         df = pd.read_excel(uploaded_file)
-        df = standardize_organization_names(df)
         logger.info("Data import successful")
     except Exception as e:
-        logger.error(f"Failed to read Excel file: {str(e)}")
         st.error("Failed to read the uploaded file. Please ensure it's a valid Excel file.")
-        return None
     # Validate required columns
     missing_columns = [col for col in required_columns.keys() if col not in df.columns]
@@ -156,7 +158,7 @@ def process_data(uploaded_file, sens_level):
         error_msg = f"Missing required columns: {', '.join(missing_columns)}"
         logger.error(error_msg)
         st.error(error_msg)
-        return None
     # Rename required columns while preserving all others
     df = df.rename(columns={k: v for k, v in required_columns.items() if k in df.columns})
@@ -250,23 +252,20 @@ def process_data(uploaded_file, sens_level):
     # Create normalized leverage scale (0-1) where 300% leverage = 1
     df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
-    # Check if text fields have minimum required words
-    df['scope_words_lt_10'] = df['scope_txt'].str.split().str.len() < 10
-    df['fin_words_lt_10'] = df['fin_txt'].str.split().str.len() < 10
-    df['tech_words_lt_10'] = df['tech_txt'].str.split().str.len() < 10
     df['word_length_check'] = df.apply(lambda x:
-        True if x['scope_txt'].str.split().str.len() < 10 and
-            x['fin_txt'].str.split().str.len() < 10 and
-            x['tech_txt'].str.split().str.len() < 10
             else False, axis=1)
     # Predict score
     sector_classes = ['Energy','Transport','Industries']
     df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
     # labelling logic
     df['pred_action'] = df.apply(lambda x:
         'INELIGIBLE' if (('concept_count' in df.columns and x['concept_count'] > 6) or
-                        x['LANG'] != 'en-US' or
                         x['ADAPMIT'] == 'Adaptation' or
                         not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes) or
                         x['word_length_check'] == True)

     # Read the Excel file
     try:
         df = pd.read_excel(uploaded_file)
         logger.info("Data import successful")
+        df = standardize_organization_names(df)
     except Exception as e:
+        error_msg = f"Failed to read Excel file: {str(e)}"
+        logger.error(error_msg)
         st.error("Failed to read the uploaded file. Please ensure it's a valid Excel file.")
+        raise ValueError(error_msg)
     # Validate required columns
     missing_columns = [col for col in required_columns.keys() if col not in df.columns]
         error_msg = f"Missing required columns: {', '.join(missing_columns)}"
         logger.error(error_msg)
         st.error(error_msg)
+        raise ValueError(error_msg)
     # Rename required columns while preserving all others
     df = df.rename(columns={k: v for k, v in required_columns.items() if k in df.columns})
     # Create normalized leverage scale (0-1) where 300% leverage = 1
     df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
+    # Test if all text fields don't have minimum required words
     df['word_length_check'] = df.apply(lambda x:
+        True if len(x['scope_txt'].split()) < 10 and
+            len(x['fin_txt'].split()) < 10 and
+            len(x['tech_txt'].split()) < 10
             else False, axis=1)
     # Predict score
     sector_classes = ['Energy','Transport','Industries']
     df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
     # labelling logic
     df['pred_action'] = df.apply(lambda x:
         'INELIGIBLE' if (('concept_count' in df.columns and x['concept_count'] > 6) or
+                        x['LANG'] != 'en' or
                         x['ADAPMIT'] == 'Adaptation' or
                         not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes) or
                         x['word_length_check'] == True)