mtyrrell committed on
Commit
3a17c13
·
1 Parent(s): 78c3d60

fix for upload with no organization column

Browse files
Files changed (2) hide show
  1. modules/org_count.py +108 -102
  2. modules/utils.py +4 -6
modules/org_count.py CHANGED
@@ -17,114 +17,120 @@ def standardize_organization_names(df):
17
  """
18
  # Make a copy to avoid modifying the original DataFrame
19
  df = df.copy()
20
- logger.info(f"Checking org names")
21
- # Dictionary of organization variations and their standardized names
22
- org_variations = {
23
- 'Adventist Development Relief Agency': ['adventist development'],
24
- 'Asian Development Bank': ['asian development bank'],
25
- 'Association of the Regional Mechanism for Emissions Reductions of Boyacá, Colombia (MRRE)': [' regional mechanism for emissions reductions of boyacá'],
26
- 'BioCarbon Partners (BCP)': ['biocarbon partners'],
27
- 'Biothermica Technologies Inc': ['biothermica tech'],
28
- 'Brazilian Tourist Board': ['brazilian tourist board'],
29
- 'Caribbean Community Climate Change Centre': ['caribbean community climate'],
30
- 'Caritas': ['caritas'],
31
- 'Climate Advocacy International (CAI)': ['climate advocacy int'],
32
- 'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['deutsche gesellschaft für internationale'],
33
- 'Deutsche Sparkassenstiftung (DSIK)': ['deutsche sparkassenstiftung'],
34
- 'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['east african centre of excellence for renewable'],
35
- 'Eco-Ideal': ['eco-ideal'],
36
- 'Global Green Growth Institute (GGGI)': ['global green growth'],
37
- 'Inter-American Development Bank (IDB)': ['american development bank'],
38
- 'Iskandar Regional Development Authority (IRDA)': ['iskandar regional'],
39
- 'Islamic Development Bank': ['islamic development bank'],
40
- 'Malaysian Industry Government Group for High Technology (MIGHT)': ['government group for high technology'],
41
- 'Osh Technological University': ['osh technological university','ошский технологический университет'],
42
- 'Oxford Policy Management (OPM)': ['oxford policy management'],
43
- 'Pacific Rim Investment Management': ['pacific rim investment'],
44
- 'Palestinian Energy and Natural Resources Authority (PENRA)': ['palestinian energy and natural'],
45
- 'Secretariat of the Pacific Regional Environment Programme (SPREP)': ['secretariat of the pacific regional environment programme (sprep)'],
46
- 'Serviço Nacional de Aprendizagem Industrial (SENAI)': ['serviço nacional de aprendizagem'],
47
- 'Sumy City Council': ['sumy city council'],
48
- 'Uganda Development Bank Limited (UDBL)': ['uganda development bank'],
49
- 'UN-Habitat': ['united nations human settlement','un-habitat'],
50
- 'United Nations Children\'s Fund (UNICEF)': ['united nations children'],
51
- 'United Nations Conference on Trade and Development (UNCTAD)': ['united nations conference on trade'],
52
- 'United Nations Development Programme (UNDP)': ['united nations development program'],
53
- 'United Nations Economic and Social Commission (ECOSOC)': ['united nations economic and social'],
54
- 'United Nations Environment Programme (UNEP)': ['united nations environment'],
55
- 'United Nations Industrial Development Organization (UNIDO)': ['united nations industrial'],
56
- 'United Nations Office for Project Services (UNOPS)': ['united nations office for project'],
57
- 'World Food Programme (WFP)': ['world food program'],
58
- 'World Resources Institute (WRI)': ['world resources institute'],
59
- 'World Wide Fund for Nature (WWF)': ['world wildlife','world wide fund for nature'],
60
- }
61
 
62
- # Process exact matches first
63
- df['check_name'] = None
64
- for standard_name, variations in org_variations.items():
65
- mask = df['organization'].str.lower().apply(lambda x: any(var in str(x) for var in variations))
66
- df.loc[mask, 'check_name'] = standard_name
67
 
68
- # Dictionary of organization abbreviations
69
- org_abreviations = {
70
- 'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['GIZ'],
71
- 'Deutsche Sparkassenstiftung (DSIK)': ['DSIK'],
72
- 'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['EACREEE'],
73
- 'Global Green Growth Institute (GGGI)': ['GGGI'],
74
- 'UN-Habitat': ['UN-Habitat'],
75
- 'United Nations Children\'s Fund (UNICEF)': ['UNICEF'],
76
- 'United Nations Conference on Trade and Development (UNCTAD)': ['UNCTAD'],
77
- 'United Nations Development Programme (UNDP)': ['UNDP'],
78
- 'United Nations Economic and Social Commission (ECOSOC)': ['ECOSOC'],
79
- 'United Nations Environment Programme (UNEP)': ['UNEP'],
80
- 'United Nations Industrial Development Organization (UNIDO)': ['UNIDO'],
81
- 'United Nations Office for Project Services (UNOPS)': ['UNOPS'],
82
- 'World Food Programme (WFP)': ['WFP'],
83
- 'World Resources Institute (WRI)': ['WRI'],
84
- 'World Wide Fund for Nature (WWF)': ['WWF']
85
- }
86
-
87
- # Process abbreviations
88
- df['check_abreviation'] = None
89
- for standard_name, abreviations in org_abreviations.items():
90
- for abreviation in abreviations:
91
- mask = df['organization'].str.contains(abreviation, regex=False, na=False)
92
- df.loc[mask, 'check_abreviation'] = standard_name
93
-
94
- df['org_renamed'] = df.apply(lambda row: row['check_abreviation'] if pd.isnull(row['check_name']) else row['check_name'], axis=1)
95
- df.drop(columns=['check_name', 'check_abreviation'], inplace=True)
96
-
97
- # Process fuzzy matches
98
- unmatched_mask = df['org_renamed'].isna()
99
- threshold = 90
100
-
101
- for idx, row in df[unmatched_mask].iterrows():
102
- org_name = str(row['organization']).lower()
103
- best_match = None
104
- highest_ratio = 0
 
 
 
 
 
105
 
 
 
106
  for standard_name, variations in org_variations.items():
107
- all_forms = [standard_name.lower()] + variations
108
- for variant in all_forms:
109
- ratio = fuzz.ratio(org_name, variant)
110
- if ratio > threshold and ratio > highest_ratio:
111
- highest_ratio = ratio
112
- best_match = standard_name
113
 
114
- if best_match:
115
- df.loc[idx, 'org_renamed'] = best_match
116
-
117
- # Fill remaining empty values with original names
118
- df.loc[df['org_renamed'].isna(), 'org_renamed'] = df.loc[df['org_renamed'].isna(), 'organization']
119
-
120
- # Add concept count
121
- df['concept_count'] = df.groupby('org_renamed').cumcount() + 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
- # Reorder columns with id, organization, org_renamed, concept_count first, followed by all others
124
- cols = ['id', 'organization', 'org_renamed', 'concept_count']
125
- other_cols = [col for col in df.columns if col not in cols]
126
- df = df[cols + other_cols]
127
-
128
  return df
129
 
130
  # Example usage:
 
17
  """
18
  # Make a copy to avoid modifying the original DataFrame
19
  df = df.copy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
+ # Return DataFrame as-is if 'organization' column is not present
22
+ if 'organization' not in df.columns:
23
+ logger.warning("No 'organization' column found in DataFrame. Returning DataFrame as-is.")
 
 
24
 
25
+ else:
26
+ logger.info(f"Checking org names")
27
+ # Dictionary of organization variations and their standardized names
28
+ org_variations = {
29
+ 'Adventist Development Relief Agency': ['adventist development'],
30
+ 'Asian Development Bank': ['asian development bank'],
31
+ 'Association of the Regional Mechanism for Emissions Reductions of Boyacá, Colombia (MRRE)': [' regional mechanism for emissions reductions of boyacá'],
32
+ 'BioCarbon Partners (BCP)': ['biocarbon partners'],
33
+ 'Biothermica Technologies Inc': ['biothermica tech'],
34
+ 'Brazilian Tourist Board': ['brazilian tourist board'],
35
+ 'Caribbean Community Climate Change Centre': ['caribbean community climate'],
36
+ 'Caritas': ['caritas'],
37
+ 'Climate Advocacy International (CAI)': ['climate advocacy int'],
38
+ 'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['deutsche gesellschaft für internationale'],
39
+ 'Deutsche Sparkassenstiftung (DSIK)': ['deutsche sparkassenstiftung'],
40
+ 'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['east african centre of excellence for renewable'],
41
+ 'Eco-Ideal': ['eco-ideal'],
42
+ 'Global Green Growth Institute (GGGI)': ['global green growth'],
43
+ 'Inter-American Development Bank (IDB)': ['american development bank'],
44
+ 'Iskandar Regional Development Authority (IRDA)': ['iskandar regional'],
45
+ 'Islamic Development Bank': ['islamic development bank'],
46
+ 'Malaysian Industry Government Group for High Technology (MIGHT)': ['government group for high technology'],
47
+ 'Osh Technological University': ['osh technological university','ошский технологический университет'],
48
+ 'Oxford Policy Management (OPM)': ['oxford policy management'],
49
+ 'Pacific Rim Investment Management': ['pacific rim investment'],
50
+ 'Palestinian Energy and Natural Resources Authority (PENRA)': ['palestinian energy and natural'],
51
+ 'Secretariat of the Pacific Regional Environment Programme (SPREP)': ['secretariat of the pacific regional environment programme (sprep)'],
52
+ 'Serviço Nacional de Aprendizagem Industrial (SENAI)': ['serviço nacional de aprendizagem'],
53
+ 'Sumy City Council': ['sumy city council'],
54
+ 'Uganda Development Bank Limited (UDBL)': ['uganda development bank'],
55
+ 'UN-Habitat': ['united nations human settlement','un-habitat'],
56
+ 'United Nations Children\'s Fund (UNICEF)': ['united nations children'],
57
+ 'United Nations Conference on Trade and Development (UNCTAD)': ['united nations conference on trade'],
58
+ 'United Nations Development Programme (UNDP)': ['united nations development program'],
59
+ 'United Nations Economic and Social Commission (ECOSOC)': ['united nations economic and social'],
60
+ 'United Nations Environment Programme (UNEP)': ['united nations environment'],
61
+ 'United Nations Industrial Development Organization (UNIDO)': ['united nations industrial'],
62
+ 'United Nations Office for Project Services (UNOPS)': ['united nations office for project'],
63
+ 'World Food Programme (WFP)': ['world food program'],
64
+ 'World Resources Institute (WRI)': ['world resources institute'],
65
+ 'World Wide Fund for Nature (WWF)': ['world wildlife','world wide fund for nature'],
66
+ }
67
 
68
+ # Process exact matches first
69
+ df['check_name'] = None
70
  for standard_name, variations in org_variations.items():
71
+ mask = df['organization'].str.lower().apply(lambda x: any(var in str(x) for var in variations))
72
+ df.loc[mask, 'check_name'] = standard_name
 
 
 
 
73
 
74
+ # Dictionary of organization abbreviations
75
+ org_abreviations = {
76
+ 'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['GIZ'],
77
+ 'Deutsche Sparkassenstiftung (DSIK)': ['DSIK'],
78
+ 'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['EACREEE'],
79
+ 'Global Green Growth Institute (GGGI)': ['GGGI'],
80
+ 'UN-Habitat': ['UN-Habitat'],
81
+ 'United Nations Children\'s Fund (UNICEF)': ['UNICEF'],
82
+ 'United Nations Conference on Trade and Development (UNCTAD)': ['UNCTAD'],
83
+ 'United Nations Development Programme (UNDP)': ['UNDP'],
84
+ 'United Nations Economic and Social Commission (ECOSOC)': ['ECOSOC'],
85
+ 'United Nations Environment Programme (UNEP)': ['UNEP'],
86
+ 'United Nations Industrial Development Organization (UNIDO)': ['UNIDO'],
87
+ 'United Nations Office for Project Services (UNOPS)': ['UNOPS'],
88
+ 'World Food Programme (WFP)': ['WFP'],
89
+ 'World Resources Institute (WRI)': ['WRI'],
90
+ 'World Wide Fund for Nature (WWF)': ['WWF']
91
+ }
92
+
93
+ # Process abbreviations
94
+ df['check_abreviation'] = None
95
+ for standard_name, abreviations in org_abreviations.items():
96
+ for abreviation in abreviations:
97
+ mask = df['organization'].str.contains(abreviation, regex=False, na=False)
98
+ df.loc[mask, 'check_abreviation'] = standard_name
99
+
100
+ df['org_renamed'] = df.apply(lambda row: row['check_abreviation'] if pd.isnull(row['check_name']) else row['check_name'], axis=1)
101
+ df.drop(columns=['check_name', 'check_abreviation'], inplace=True)
102
+
103
+ # Process fuzzy matches
104
+ unmatched_mask = df['org_renamed'].isna()
105
+ threshold = 90
106
+
107
+ for idx, row in df[unmatched_mask].iterrows():
108
+ org_name = str(row['organization']).lower()
109
+ best_match = None
110
+ highest_ratio = 0
111
+
112
+ for standard_name, variations in org_variations.items():
113
+ all_forms = [standard_name.lower()] + variations
114
+ for variant in all_forms:
115
+ ratio = fuzz.ratio(org_name, variant)
116
+ if ratio > threshold and ratio > highest_ratio:
117
+ highest_ratio = ratio
118
+ best_match = standard_name
119
+
120
+ if best_match:
121
+ df.loc[idx, 'org_renamed'] = best_match
122
+
123
+ # Fill remaining empty values with original names
124
+ df.loc[df['org_renamed'].isna(), 'org_renamed'] = df.loc[df['org_renamed'].isna(), 'organization']
125
+
126
+ # Add concept count
127
+ df['concept_count'] = df.groupby('org_renamed').cumcount() + 1
128
 
129
+ # Reorder columns with id, organization, org_renamed, concept_count first, followed by all others
130
+ cols = ['id', 'organization', 'org_renamed', 'concept_count']
131
+ other_cols = [col for col in df.columns if col not in cols]
132
+ df = df[cols + other_cols]
133
+
134
  return df
135
 
136
  # Example usage:
modules/utils.py CHANGED
@@ -16,7 +16,6 @@ logger = logging.getLogger(__name__)
16
 
17
  # Function for creating Upload template file
18
  def create_excel():
19
- # Create a workbook and select the active worksheet
20
  wb = Workbook()
21
  sheet = wb.active
22
  sheet.title = "template"
@@ -32,7 +31,7 @@ def create_excel():
32
  sheet.append(columns) # Appending columns to the first row
33
 
34
  # formatting
35
- for c in sheet['A1:J4'][0]:
36
  c.fill = PatternFill('solid', fgColor = 'bad8e1')
37
  c.font = Font(bold=True)
38
 
@@ -220,7 +219,6 @@ def process_data(uploaded_file, sens_level):
220
  f"Estimated time remaining: {estimated_time_remaining:.1f}s"
221
  f" (step {step_count+1} of {len(model_names)})"
222
  )
223
- # estimated_time_remaining_text.write(f'Estimated Time Remaining: {estimated_time_remaining:.0f} seconds (step {step_count+1} of 9)')
224
  else:
225
  estimated_time_remaining_text.write(f'Calculating time remaining... (step {step_count+1} of {len(model_names)})')
226
 
@@ -251,12 +249,12 @@ def process_data(uploaded_file, sens_level):
251
  # Create normalized leverage scale (0-1) where 300% leverage = 1
252
  df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
253
 
254
- # Further data processing and actions
255
  sector_classes = ['Energy','Transport','Industries']
256
  df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
257
- # df['pred_action'] = df.apply(lambda x: 'REJECT' if (x['pred_score'] <4 or x['LANG'] != 'en-US' or x['ADAPMIT'] == 'Adaptation' or not ((x['SECTOR1'] in sector_classes) or (x['SECTOR2'] in sector_classes))) else 'REVIEW', axis=1)
258
  df['pred_action'] = df.apply(lambda x:
259
- 'INELIGIBLE' if (x['concept_count'] > 6 or
260
  x['LANG'] != 'en-US' or
261
  x['ADAPMIT'] == 'Adaptation' or
262
  not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes))
 
16
 
17
  # Function for creating Upload template file
18
  def create_excel():
 
19
  wb = Workbook()
20
  sheet = wb.active
21
  sheet.title = "template"
 
31
  sheet.append(columns) # Appending columns to the first row
32
 
33
  # formatting
34
+ for c in sheet['A1:I4'][0]:
35
  c.fill = PatternFill('solid', fgColor = 'bad8e1')
36
  c.font = Font(bold=True)
37
 
 
219
  f"Estimated time remaining: {estimated_time_remaining:.1f}s"
220
  f" (step {step_count+1} of {len(model_names)})"
221
  )
 
222
  else:
223
  estimated_time_remaining_text.write(f'Calculating time remaining... (step {step_count+1} of {len(model_names)})')
224
 
 
249
  # Create normalized leverage scale (0-1) where 300% leverage = 1
250
  df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
251
 
252
+ # Predict score
253
  sector_classes = ['Energy','Transport','Industries']
254
  df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
255
+ # labelling logic
256
  df['pred_action'] = df.apply(lambda x:
257
+ 'INELIGIBLE' if (('concept_count' in df.columns and x['concept_count'] > 6) or
258
  x['LANG'] != 'en-US' or
259
  x['ADAPMIT'] == 'Adaptation' or
260
  not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes))