Spaces:

mtyrrell
/

maf_prefilter_app

Running

App Files Community

mtyrrell committed on Feb 6

Commit

3a17c13

1 Parent(s): 78c3d60

fix for upload with no organization column

Browse files

Files changed (2) hide show

modules/org_count.py +108 -102
modules/utils.py +4 -6

modules/org_count.py CHANGED Viewed

@@ -17,114 +17,120 @@ def standardize_organization_names(df):
     """
     # Make a copy to avoid modifying the original DataFrame
     df = df.copy()
-    logger.info(f"Checking org names")
-    # Dictionary of organization variations and their standardized names
-    org_variations = {
-        'Adventist Development Relief Agency': ['adventist development'],
-        'Asian Development Bank': ['asian development bank'],
-        'Association of the Regional Mechanism for Emissions Reductions of Boyacá, Colombia (MRRE)': [' regional mechanism for emissions reductions of boyacá'],
-        'BioCarbon Partners (BCP)': ['biocarbon partners'],
-        'Biothermica Technologies Inc': ['biothermica tech'],
-        'Brazilian Tourist Board': ['brazilian tourist board'],
-        'Caribbean Community Climate Change Centre': ['caribbean community climate'],
-        'Caritas': ['caritas'],
-        'Climate Advocacy International (CAI)': ['climate advocacy int'],
-        'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['deutsche gesellschaft für internationale'],
-        'Deutsche Sparkassenstiftung (DSIK)': ['deutsche sparkassenstiftung'],
-        'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['east african centre of excellence for renewable'],
-        'Eco-Ideal': ['eco-ideal'],
-        'Global Green Growth Institute (GGGI)': ['global green growth'],
-        'Inter-American Development Bank (IDB)': ['american development bank'],
-        'Iskandar Regional Development Authority (IRDA)': ['iskandar regional'],
-        'Islamic Development Bank': ['islamic development bank'],
-        'Malaysian Industry Government Group for High Technology (MIGHT)': ['government group for high technology'],
-        'Osh Technological University': ['osh technological university','ошский технологический университет'],
-        'Oxford Policy Management (OPM)': ['oxford policy management'],
-        'Pacific Rim Investment Management': ['pacific rim investment'],
-        'Palestinian Energy and Natural Resources Authority (PENRA)': ['palestinian energy and natural'],
-        'Secretariat of the Pacific Regional Environment Programme (SPREP)': ['secretariat of the pacific regional environment programme (sprep)'],
-        'Serviço Nacional de Aprendizagem Industrial (SENAI)': ['serviço nacional de aprendizagem'],
-        'Sumy City Council': ['sumy city council'],
-        'Uganda Development Bank Limited (UDBL)': ['uganda development bank'],
-        'UN-Habitat': ['united nations human settlement','un-habitat'],
-        'United Nations Children\'s Fund (UNICEF)': ['united nations children'],
-        'United Nations Conference on Trade and Development (UNCTAD)': ['united nations conference on trade'],
-        'United Nations Development Programme (UNDP)': ['united nations development program'],
-        'United Nations Economic and Social Commission (ECOSOC)': ['united nations economic and social'],
-        'United Nations Environment Programme (UNEP)': ['united nations environment'],
-        'United Nations Industrial Development Organization (UNIDO)': ['united nations industrial'],
-        'United Nations Office for Project Services (UNOPS)': ['united nations office for project'],
-        'World Food Programme (WFP)': ['world food program'],
-        'World Resources Institute (WRI)': ['world resources institute'],
-        'World Wide Fund for Nature (WWF)': ['world wildlife','world wide fund for nature'],
-    }
-    # Process exact matches first
-    df['check_name'] = None
-    for standard_name, variations in org_variations.items():
-        mask = df['organization'].str.lower().apply(lambda x: any(var in str(x) for var in variations))
-        df.loc[mask, 'check_name'] = standard_name
-    # Dictionary of organization abbreviations
-    org_abreviations = {
-        'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['GIZ'],
-        'Deutsche Sparkassenstiftung (DSIK)': ['DSIK'],
-        'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['EACREEE'],
-        'Global Green Growth Institute (GGGI)': ['GGGI'],
-        'UN-Habitat': ['UN-Habitat'],
-        'United Nations Children\'s Fund (UNICEF)': ['UNICEF'],
-        'United Nations Conference on Trade and Development (UNCTAD)': ['UNCTAD'],
-        'United Nations Development Programme (UNDP)': ['UNDP'],
-        'United Nations Economic and Social Commission (ECOSOC)': ['ECOSOC'],
-        'United Nations Environment Programme (UNEP)': ['UNEP'],
-        'United Nations Industrial Development Organization (UNIDO)': ['UNIDO'],
-        'United Nations Office for Project Services (UNOPS)': ['UNOPS'],
-        'World Food Programme (WFP)': ['WFP'],
-        'World Resources Institute (WRI)': ['WRI'],
-        'World Wide Fund for Nature (WWF)': ['WWF']
-    }
-    # Process abbreviations
-    df['check_abreviation'] = None
-    for standard_name, abreviations in org_abreviations.items():
-        for abreviation in abreviations:
-            mask = df['organization'].str.contains(abreviation, regex=False, na=False)
-            df.loc[mask, 'check_abreviation'] = standard_name
-    df['org_renamed'] = df.apply(lambda row: row['check_abreviation'] if pd.isnull(row['check_name']) else row['check_name'], axis=1)
-    df.drop(columns=['check_name', 'check_abreviation'], inplace=True)
-    # Process fuzzy matches
-    unmatched_mask = df['org_renamed'].isna()
-    threshold = 90
-    for idx, row in df[unmatched_mask].iterrows():
-        org_name = str(row['organization']).lower()
-        best_match = None
-        highest_ratio = 0
         for standard_name, variations in org_variations.items():
-            all_forms = [standard_name.lower()] + variations
-            for variant in all_forms:
-                ratio = fuzz.ratio(org_name, variant)
-                if ratio > threshold and ratio > highest_ratio:
-                    highest_ratio = ratio
-                    best_match = standard_name
-        if best_match:
-            df.loc[idx, 'org_renamed'] = best_match
-    # Fill remaining empty values with original names
-    df.loc[df['org_renamed'].isna(), 'org_renamed'] = df.loc[df['org_renamed'].isna(), 'organization']
-    # Add concept count
-    df['concept_count'] = df.groupby('org_renamed').cumcount() + 1
-    # Reorder columns with id, organization, org_renamed, concept_count first, followed by all others
-    cols = ['id', 'organization', 'org_renamed', 'concept_count']
-    other_cols = [col for col in df.columns if col not in cols]
-    df = df[cols + other_cols]
     return df
 # Example usage:

     """
     # Make a copy to avoid modifying the original DataFrame
     df = df.copy()
+    # Return DataFrame as-is if 'organization' column is not present
+    if 'organization' not in df.columns:
+        logger.warning("No 'organization' column found in DataFrame. Returning DataFrame as-is.")
+    else:
+        logger.info(f"Checking org names")
+        # Dictionary of organization variations and their standardized names
+        org_variations = {
+            'Adventist Development Relief Agency': ['adventist development'],
+            'Asian Development Bank': ['asian development bank'],
+            'Association of the Regional Mechanism for Emissions Reductions of Boyacá, Colombia (MRRE)': [' regional mechanism for emissions reductions of boyacá'],
+            'BioCarbon Partners (BCP)': ['biocarbon partners'],
+            'Biothermica Technologies Inc': ['biothermica tech'],
+            'Brazilian Tourist Board': ['brazilian tourist board'],
+            'Caribbean Community Climate Change Centre': ['caribbean community climate'],
+            'Caritas': ['caritas'],
+            'Climate Advocacy International (CAI)': ['climate advocacy int'],
+            'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['deutsche gesellschaft für internationale'],
+            'Deutsche Sparkassenstiftung (DSIK)': ['deutsche sparkassenstiftung'],
+            'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['east african centre of excellence for renewable'],
+            'Eco-Ideal': ['eco-ideal'],
+            'Global Green Growth Institute (GGGI)': ['global green growth'],
+            'Inter-American Development Bank (IDB)': ['american development bank'],
+            'Iskandar Regional Development Authority (IRDA)': ['iskandar regional'],
+            'Islamic Development Bank': ['islamic development bank'],
+            'Malaysian Industry Government Group for High Technology (MIGHT)': ['government group for high technology'],
+            'Osh Technological University': ['osh technological university','ошский технологический университет'],
+            'Oxford Policy Management (OPM)': ['oxford policy management'],
+            'Pacific Rim Investment Management': ['pacific rim investment'],
+            'Palestinian Energy and Natural Resources Authority (PENRA)': ['palestinian energy and natural'],
+            'Secretariat of the Pacific Regional Environment Programme (SPREP)': ['secretariat of the pacific regional environment programme (sprep)'],
+            'Serviço Nacional de Aprendizagem Industrial (SENAI)': ['serviço nacional de aprendizagem'],
+            'Sumy City Council': ['sumy city council'],
+            'Uganda Development Bank Limited (UDBL)': ['uganda development bank'],
+            'UN-Habitat': ['united nations human settlement','un-habitat'],
+            'United Nations Children\'s Fund (UNICEF)': ['united nations children'],
+            'United Nations Conference on Trade and Development (UNCTAD)': ['united nations conference on trade'],
+            'United Nations Development Programme (UNDP)': ['united nations development program'],
+            'United Nations Economic and Social Commission (ECOSOC)': ['united nations economic and social'],
+            'United Nations Environment Programme (UNEP)': ['united nations environment'],
+            'United Nations Industrial Development Organization (UNIDO)': ['united nations industrial'],
+            'United Nations Office for Project Services (UNOPS)': ['united nations office for project'],
+            'World Food Programme (WFP)': ['world food program'],
+            'World Resources Institute (WRI)': ['world resources institute'],
+            'World Wide Fund for Nature (WWF)': ['world wildlife','world wide fund for nature'],
+        }
+        # Process exact matches first
+        df['check_name'] = None
         for standard_name, variations in org_variations.items():
+            mask = df['organization'].str.lower().apply(lambda x: any(var in str(x) for var in variations))
+            df.loc[mask, 'check_name'] = standard_name
+        # Dictionary of organization abbreviations
+        org_abreviations = {
+            'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['GIZ'],
+            'Deutsche Sparkassenstiftung (DSIK)': ['DSIK'],
+            'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['EACREEE'],
+            'Global Green Growth Institute (GGGI)': ['GGGI'],
+            'UN-Habitat': ['UN-Habitat'],
+            'United Nations Children\'s Fund (UNICEF)': ['UNICEF'],
+            'United Nations Conference on Trade and Development (UNCTAD)': ['UNCTAD'],
+            'United Nations Development Programme (UNDP)': ['UNDP'],
+            'United Nations Economic and Social Commission (ECOSOC)': ['ECOSOC'],
+            'United Nations Environment Programme (UNEP)': ['UNEP'],
+            'United Nations Industrial Development Organization (UNIDO)': ['UNIDO'],
+            'United Nations Office for Project Services (UNOPS)': ['UNOPS'],
+            'World Food Programme (WFP)': ['WFP'],
+            'World Resources Institute (WRI)': ['WRI'],
+            'World Wide Fund for Nature (WWF)': ['WWF']
+        }
+        # Process abbreviations
+        df['check_abreviation'] = None
+        for standard_name, abreviations in org_abreviations.items():
+            for abreviation in abreviations:
+                mask = df['organization'].str.contains(abreviation, regex=False, na=False)
+                df.loc[mask, 'check_abreviation'] = standard_name
+        df['org_renamed'] = df.apply(lambda row: row['check_abreviation'] if pd.isnull(row['check_name']) else row['check_name'], axis=1)
+        df.drop(columns=['check_name', 'check_abreviation'], inplace=True)
+        # Process fuzzy matches
+        unmatched_mask = df['org_renamed'].isna()
+        threshold = 90
+        for idx, row in df[unmatched_mask].iterrows():
+            org_name = str(row['organization']).lower()
+            best_match = None
+            highest_ratio = 0
+            for standard_name, variations in org_variations.items():
+                all_forms = [standard_name.lower()] + variations
+                for variant in all_forms:
+                    ratio = fuzz.ratio(org_name, variant)
+                    if ratio > threshold and ratio > highest_ratio:
+                        highest_ratio = ratio
+                        best_match = standard_name
+            if best_match:
+                df.loc[idx, 'org_renamed'] = best_match
+        # Fill remaining empty values with original names
+        df.loc[df['org_renamed'].isna(), 'org_renamed'] = df.loc[df['org_renamed'].isna(), 'organization']
+        # Add concept count
+        df['concept_count'] = df.groupby('org_renamed').cumcount() + 1
+        # Reorder columns with id, organization, org_renamed, concept_count first, followed by all others
+        cols = ['id', 'organization', 'org_renamed', 'concept_count']
+        other_cols = [col for col in df.columns if col not in cols]
+        df = df[cols + other_cols]
     return df
 # Example usage:

modules/utils.py CHANGED Viewed

@@ -16,7 +16,6 @@ logger = logging.getLogger(__name__)
 # Function for creating Upload template file
 def create_excel():
-    # Create a workbook and select the active worksheet
     wb = Workbook()
     sheet = wb.active
     sheet.title = "template"
@@ -32,7 +31,7 @@ def create_excel():
     sheet.append(columns)  # Appending columns to the first row
     # formatting
-    for c in sheet['A1:J4'][0]:
         c.fill = PatternFill('solid', fgColor = 'bad8e1')
         c.font = Font(bold=True)
@@ -220,7 +219,6 @@ def process_data(uploaded_file, sens_level):
                 f"Estimated time remaining: {estimated_time_remaining:.1f}s"
                 f" (step {step_count+1} of {len(model_names)})"
             )
-            # estimated_time_remaining_text.write(f'Estimated Time Remaining: {estimated_time_remaining:.0f} seconds (step {step_count+1} of 9)')
         else:
             estimated_time_remaining_text.write(f'Calculating time remaining... (step {step_count+1} of {len(model_names)})')
@@ -251,12 +249,12 @@ def process_data(uploaded_file, sens_level):
     # Create normalized leverage scale (0-1) where 300% leverage = 1
     df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
-    # Further data processing and actions
     sector_classes = ['Energy','Transport','Industries']
     df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
-    # df['pred_action'] = df.apply(lambda x: 'REJECT' if (x['pred_score'] <4 or x['LANG'] != 'en-US' or x['ADAPMIT'] == 'Adaptation' or not ((x['SECTOR1'] in sector_classes) or (x['SECTOR2'] in sector_classes))) else 'REVIEW', axis=1)
     df['pred_action'] = df.apply(lambda x:
-        'INELIGIBLE' if (x['concept_count'] > 6 or
                         x['LANG'] != 'en-US' or
                         x['ADAPMIT'] == 'Adaptation' or
                         not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes))

 # Function for creating Upload template file
 def create_excel():
     wb = Workbook()
     sheet = wb.active
     sheet.title = "template"
     sheet.append(columns)  # Appending columns to the first row
     # formatting
+    for c in sheet['A1:I4'][0]:
         c.fill = PatternFill('solid', fgColor = 'bad8e1')
         c.font = Font(bold=True)
                 f"Estimated time remaining: {estimated_time_remaining:.1f}s"
                 f" (step {step_count+1} of {len(model_names)})"
             )
         else:
             estimated_time_remaining_text.write(f'Calculating time remaining... (step {step_count+1} of {len(model_names)})')
     # Create normalized leverage scale (0-1) where 300% leverage = 1
     df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
+    # Predict score
     sector_classes = ['Energy','Transport','Industries']
     df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
+    # labelling logic
     df['pred_action'] = df.apply(lambda x:
+        'INELIGIBLE' if (('concept_count' in df.columns and x['concept_count'] > 6) or
                         x['LANG'] != 'en-US' or
                         x['ADAPMIT'] == 'Adaptation' or
                         not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes))