Spaces:
Running
Running
Fix upload handling when the 'organization' column is missing
Browse files
- modules/org_count.py +108 -102
- modules/utils.py +4 -6
modules/org_count.py
CHANGED
@@ -17,114 +17,120 @@ def standardize_organization_names(df):
|
|
17 |
"""
|
18 |
# Make a copy to avoid modifying the original DataFrame
|
19 |
df = df.copy()
|
20 |
-
logger.info(f"Checking org names")
|
21 |
-
# Dictionary of organization variations and their standardized names
|
22 |
-
org_variations = {
|
23 |
-
'Adventist Development Relief Agency': ['adventist development'],
|
24 |
-
'Asian Development Bank': ['asian development bank'],
|
25 |
-
'Association of the Regional Mechanism for Emissions Reductions of Boyacá, Colombia (MRRE)': [' regional mechanism for emissions reductions of boyacá'],
|
26 |
-
'BioCarbon Partners (BCP)': ['biocarbon partners'],
|
27 |
-
'Biothermica Technologies Inc': ['biothermica tech'],
|
28 |
-
'Brazilian Tourist Board': ['brazilian tourist board'],
|
29 |
-
'Caribbean Community Climate Change Centre': ['caribbean community climate'],
|
30 |
-
'Caritas': ['caritas'],
|
31 |
-
'Climate Advocacy International (CAI)': ['climate advocacy int'],
|
32 |
-
'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['deutsche gesellschaft für internationale'],
|
33 |
-
'Deutsche Sparkassenstiftung (DSIK)': ['deutsche sparkassenstiftung'],
|
34 |
-
'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['east african centre of excellence for renewable'],
|
35 |
-
'Eco-Ideal': ['eco-ideal'],
|
36 |
-
'Global Green Growth Institute (GGGI)': ['global green growth'],
|
37 |
-
'Inter-American Development Bank (IDB)': ['american development bank'],
|
38 |
-
'Iskandar Regional Development Authority (IRDA)': ['iskandar regional'],
|
39 |
-
'Islamic Development Bank': ['islamic development bank'],
|
40 |
-
'Malaysian Industry Government Group for High Technology (MIGHT)': ['government group for high technology'],
|
41 |
-
'Osh Technological University': ['osh technological university','ошский технологический университет'],
|
42 |
-
'Oxford Policy Management (OPM)': ['oxford policy management'],
|
43 |
-
'Pacific Rim Investment Management': ['pacific rim investment'],
|
44 |
-
'Palestinian Energy and Natural Resources Authority (PENRA)': ['palestinian energy and natural'],
|
45 |
-
'Secretariat of the Pacific Regional Environment Programme (SPREP)': ['secretariat of the pacific regional environment programme (sprep)'],
|
46 |
-
'Serviço Nacional de Aprendizagem Industrial (SENAI)': ['serviço nacional de aprendizagem'],
|
47 |
-
'Sumy City Council': ['sumy city council'],
|
48 |
-
'Uganda Development Bank Limited (UDBL)': ['uganda development bank'],
|
49 |
-
'UN-Habitat': ['united nations human settlement','un-habitat'],
|
50 |
-
'United Nations Children\'s Fund (UNICEF)': ['united nations children'],
|
51 |
-
'United Nations Conference on Trade and Development (UNCTAD)': ['united nations conference on trade'],
|
52 |
-
'United Nations Development Programme (UNDP)': ['united nations development program'],
|
53 |
-
'United Nations Economic and Social Commission (ECOSOC)': ['united nations economic and social'],
|
54 |
-
'United Nations Environment Programme (UNEP)': ['united nations environment'],
|
55 |
-
'United Nations Industrial Development Organization (UNIDO)': ['united nations industrial'],
|
56 |
-
'United Nations Office for Project Services (UNOPS)': ['united nations office for project'],
|
57 |
-
'World Food Programme (WFP)': ['world food program'],
|
58 |
-
'World Resources Institute (WRI)': ['world resources institute'],
|
59 |
-
'World Wide Fund for Nature (WWF)': ['world wildlife','world wide fund for nature'],
|
60 |
-
}
|
61 |
|
62 |
-
#
|
63 |
-
|
64 |
-
|
65 |
-
mask = df['organization'].str.lower().apply(lambda x: any(var in str(x) for var in variations))
|
66 |
-
df.loc[mask, 'check_name'] = standard_name
|
67 |
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
105 |
|
|
|
|
|
106 |
for standard_name, variations in org_variations.items():
|
107 |
-
|
108 |
-
|
109 |
-
ratio = fuzz.ratio(org_name, variant)
|
110 |
-
if ratio > threshold and ratio > highest_ratio:
|
111 |
-
highest_ratio = ratio
|
112 |
-
best_match = standard_name
|
113 |
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
return df
|
129 |
|
130 |
# Example usage:
|
|
|
17 |
"""
|
18 |
# Make a copy to avoid modifying the original DataFrame
|
19 |
df = df.copy()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
+
# Return DataFrame as-is if 'organization' column is not present
|
22 |
+
if 'organization' not in df.columns:
|
23 |
+
logger.warning("No 'organization' column found in DataFrame. Returning DataFrame as-is.")
|
|
|
|
|
24 |
|
25 |
+
else:
|
26 |
+
logger.info(f"Checking org names")
|
27 |
+
# Dictionary of organization variations and their standardized names
|
28 |
+
org_variations = {
|
29 |
+
'Adventist Development Relief Agency': ['adventist development'],
|
30 |
+
'Asian Development Bank': ['asian development bank'],
|
31 |
+
'Association of the Regional Mechanism for Emissions Reductions of Boyacá, Colombia (MRRE)': [' regional mechanism for emissions reductions of boyacá'],
|
32 |
+
'BioCarbon Partners (BCP)': ['biocarbon partners'],
|
33 |
+
'Biothermica Technologies Inc': ['biothermica tech'],
|
34 |
+
'Brazilian Tourist Board': ['brazilian tourist board'],
|
35 |
+
'Caribbean Community Climate Change Centre': ['caribbean community climate'],
|
36 |
+
'Caritas': ['caritas'],
|
37 |
+
'Climate Advocacy International (CAI)': ['climate advocacy int'],
|
38 |
+
'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['deutsche gesellschaft für internationale'],
|
39 |
+
'Deutsche Sparkassenstiftung (DSIK)': ['deutsche sparkassenstiftung'],
|
40 |
+
'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['east african centre of excellence for renewable'],
|
41 |
+
'Eco-Ideal': ['eco-ideal'],
|
42 |
+
'Global Green Growth Institute (GGGI)': ['global green growth'],
|
43 |
+
'Inter-American Development Bank (IDB)': ['american development bank'],
|
44 |
+
'Iskandar Regional Development Authority (IRDA)': ['iskandar regional'],
|
45 |
+
'Islamic Development Bank': ['islamic development bank'],
|
46 |
+
'Malaysian Industry Government Group for High Technology (MIGHT)': ['government group for high technology'],
|
47 |
+
'Osh Technological University': ['osh technological university','ошский технологический университет'],
|
48 |
+
'Oxford Policy Management (OPM)': ['oxford policy management'],
|
49 |
+
'Pacific Rim Investment Management': ['pacific rim investment'],
|
50 |
+
'Palestinian Energy and Natural Resources Authority (PENRA)': ['palestinian energy and natural'],
|
51 |
+
'Secretariat of the Pacific Regional Environment Programme (SPREP)': ['secretariat of the pacific regional environment programme (sprep)'],
|
52 |
+
'Serviço Nacional de Aprendizagem Industrial (SENAI)': ['serviço nacional de aprendizagem'],
|
53 |
+
'Sumy City Council': ['sumy city council'],
|
54 |
+
'Uganda Development Bank Limited (UDBL)': ['uganda development bank'],
|
55 |
+
'UN-Habitat': ['united nations human settlement','un-habitat'],
|
56 |
+
'United Nations Children\'s Fund (UNICEF)': ['united nations children'],
|
57 |
+
'United Nations Conference on Trade and Development (UNCTAD)': ['united nations conference on trade'],
|
58 |
+
'United Nations Development Programme (UNDP)': ['united nations development program'],
|
59 |
+
'United Nations Economic and Social Commission (ECOSOC)': ['united nations economic and social'],
|
60 |
+
'United Nations Environment Programme (UNEP)': ['united nations environment'],
|
61 |
+
'United Nations Industrial Development Organization (UNIDO)': ['united nations industrial'],
|
62 |
+
'United Nations Office for Project Services (UNOPS)': ['united nations office for project'],
|
63 |
+
'World Food Programme (WFP)': ['world food program'],
|
64 |
+
'World Resources Institute (WRI)': ['world resources institute'],
|
65 |
+
'World Wide Fund for Nature (WWF)': ['world wildlife','world wide fund for nature'],
|
66 |
+
}
|
67 |
|
68 |
+
# Process exact matches first
|
69 |
+
df['check_name'] = None
|
70 |
for standard_name, variations in org_variations.items():
|
71 |
+
mask = df['organization'].str.lower().apply(lambda x: any(var in str(x) for var in variations))
|
72 |
+
df.loc[mask, 'check_name'] = standard_name
|
|
|
|
|
|
|
|
|
73 |
|
74 |
+
# Dictionary of organization abbreviations
|
75 |
+
org_abreviations = {
|
76 |
+
'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['GIZ'],
|
77 |
+
'Deutsche Sparkassenstiftung (DSIK)': ['DSIK'],
|
78 |
+
'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['EACREEE'],
|
79 |
+
'Global Green Growth Institute (GGGI)': ['GGGI'],
|
80 |
+
'UN-Habitat': ['UN-Habitat'],
|
81 |
+
'United Nations Children\'s Fund (UNICEF)': ['UNICEF'],
|
82 |
+
'United Nations Conference on Trade and Development (UNCTAD)': ['UNCTAD'],
|
83 |
+
'United Nations Development Programme (UNDP)': ['UNDP'],
|
84 |
+
'United Nations Economic and Social Commission (ECOSOC)': ['ECOSOC'],
|
85 |
+
'United Nations Environment Programme (UNEP)': ['UNEP'],
|
86 |
+
'United Nations Industrial Development Organization (UNIDO)': ['UNIDO'],
|
87 |
+
'United Nations Office for Project Services (UNOPS)': ['UNOPS'],
|
88 |
+
'World Food Programme (WFP)': ['WFP'],
|
89 |
+
'World Resources Institute (WRI)': ['WRI'],
|
90 |
+
'World Wide Fund for Nature (WWF)': ['WWF']
|
91 |
+
}
|
92 |
+
|
93 |
+
# Process abbreviations
|
94 |
+
df['check_abreviation'] = None
|
95 |
+
for standard_name, abreviations in org_abreviations.items():
|
96 |
+
for abreviation in abreviations:
|
97 |
+
mask = df['organization'].str.contains(abreviation, regex=False, na=False)
|
98 |
+
df.loc[mask, 'check_abreviation'] = standard_name
|
99 |
+
|
100 |
+
df['org_renamed'] = df.apply(lambda row: row['check_abreviation'] if pd.isnull(row['check_name']) else row['check_name'], axis=1)
|
101 |
+
df.drop(columns=['check_name', 'check_abreviation'], inplace=True)
|
102 |
+
|
103 |
+
# Process fuzzy matches
|
104 |
+
unmatched_mask = df['org_renamed'].isna()
|
105 |
+
threshold = 90
|
106 |
+
|
107 |
+
for idx, row in df[unmatched_mask].iterrows():
|
108 |
+
org_name = str(row['organization']).lower()
|
109 |
+
best_match = None
|
110 |
+
highest_ratio = 0
|
111 |
+
|
112 |
+
for standard_name, variations in org_variations.items():
|
113 |
+
all_forms = [standard_name.lower()] + variations
|
114 |
+
for variant in all_forms:
|
115 |
+
ratio = fuzz.ratio(org_name, variant)
|
116 |
+
if ratio > threshold and ratio > highest_ratio:
|
117 |
+
highest_ratio = ratio
|
118 |
+
best_match = standard_name
|
119 |
+
|
120 |
+
if best_match:
|
121 |
+
df.loc[idx, 'org_renamed'] = best_match
|
122 |
+
|
123 |
+
# Fill remaining empty values with original names
|
124 |
+
df.loc[df['org_renamed'].isna(), 'org_renamed'] = df.loc[df['org_renamed'].isna(), 'organization']
|
125 |
+
|
126 |
+
# Add concept count
|
127 |
+
df['concept_count'] = df.groupby('org_renamed').cumcount() + 1
|
128 |
|
129 |
+
# Reorder columns with id, organization, org_renamed, concept_count first, followed by all others
|
130 |
+
cols = ['id', 'organization', 'org_renamed', 'concept_count']
|
131 |
+
other_cols = [col for col in df.columns if col not in cols]
|
132 |
+
df = df[cols + other_cols]
|
133 |
+
|
134 |
return df
|
135 |
|
136 |
# Example usage:
|
modules/utils.py
CHANGED
@@ -16,7 +16,6 @@ logger = logging.getLogger(__name__)
|
|
16 |
|
17 |
# Function for creating Upload template file
|
18 |
def create_excel():
|
19 |
-
# Create a workbook and select the active worksheet
|
20 |
wb = Workbook()
|
21 |
sheet = wb.active
|
22 |
sheet.title = "template"
|
@@ -32,7 +31,7 @@ def create_excel():
|
|
32 |
sheet.append(columns) # Appending columns to the first row
|
33 |
|
34 |
# formatting
|
35 |
-
for c in sheet['A1:
|
36 |
c.fill = PatternFill('solid', fgColor = 'bad8e1')
|
37 |
c.font = Font(bold=True)
|
38 |
|
@@ -220,7 +219,6 @@ def process_data(uploaded_file, sens_level):
|
|
220 |
f"Estimated time remaining: {estimated_time_remaining:.1f}s"
|
221 |
f" (step {step_count+1} of {len(model_names)})"
|
222 |
)
|
223 |
-
# estimated_time_remaining_text.write(f'Estimated Time Remaining: {estimated_time_remaining:.0f} seconds (step {step_count+1} of 9)')
|
224 |
else:
|
225 |
estimated_time_remaining_text.write(f'Calculating time remaining... (step {step_count+1} of {len(model_names)})')
|
226 |
|
@@ -251,12 +249,12 @@ def process_data(uploaded_file, sens_level):
|
|
251 |
# Create normalized leverage scale (0-1) where 300% leverage = 1
|
252 |
df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
|
253 |
|
254 |
-
#
|
255 |
sector_classes = ['Energy','Transport','Industries']
|
256 |
df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
|
257 |
-
#
|
258 |
df['pred_action'] = df.apply(lambda x:
|
259 |
-
'INELIGIBLE' if (x['concept_count'] > 6 or
|
260 |
x['LANG'] != 'en-US' or
|
261 |
x['ADAPMIT'] == 'Adaptation' or
|
262 |
not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes))
|
|
|
16 |
|
17 |
# Function for creating Upload template file
|
18 |
def create_excel():
|
|
|
19 |
wb = Workbook()
|
20 |
sheet = wb.active
|
21 |
sheet.title = "template"
|
|
|
31 |
sheet.append(columns) # Appending columns to the first row
|
32 |
|
33 |
# formatting
|
34 |
+
for c in sheet['A1:I4'][0]:
|
35 |
c.fill = PatternFill('solid', fgColor = 'bad8e1')
|
36 |
c.font = Font(bold=True)
|
37 |
|
|
|
219 |
f"Estimated time remaining: {estimated_time_remaining:.1f}s"
|
220 |
f" (step {step_count+1} of {len(model_names)})"
|
221 |
)
|
|
|
222 |
else:
|
223 |
estimated_time_remaining_text.write(f'Calculating time remaining... (step {step_count+1} of {len(model_names)})')
|
224 |
|
|
|
249 |
# Create normalized leverage scale (0-1) where 300% leverage = 1
|
250 |
df['lev_maf_scale'] = df['lev_maf_%'].apply(lambda x: min(x/300, 1) if x > 0 else 0)
|
251 |
|
252 |
+
# Predict score
|
253 |
sector_classes = ['Energy','Transport','Industries']
|
254 |
df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3']+ x['lev_gt_0']+x['lev_maf_scale'])/10*10,0), axis=1)
|
255 |
+
# labelling logic
|
256 |
df['pred_action'] = df.apply(lambda x:
|
257 |
+
'INELIGIBLE' if (('concept_count' in df.columns and x['concept_count'] > 6) or
|
258 |
x['LANG'] != 'en-US' or
|
259 |
x['ADAPMIT'] == 'Adaptation' or
|
260 |
not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes))
|