Spaces:
Running
Running
bug fixing
Browse files- modules/org_count.py +125 -0
modules/org_count.py
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from thefuzz import fuzz
|
3 |
+
|
4 |
+
|
5 |
+
def standardize_organization_names(df, fuzzy_threshold=90):
    """
    Standardize organization names and number their repeated occurrences.

    Every string in the 'organization' column is resolved in three stages,
    stopping at the first stage that yields a match:

    1. Case-insensitive substring match against known name fragments.
    2. Case-sensitive match of known abbreviations as whole tokens.
    3. Fuzzy match (``fuzz.ratio`` strictly above ``fuzzy_threshold``)
       against the standard names and their fragments.

    Strings that match nothing, and non-string values such as NaN, are
    carried over unchanged.

    Args:
        df (pd.DataFrame): DataFrame containing an 'organization' column.
        fuzzy_threshold (int): Ratio that a fuzzy match must exceed.

    Returns:
        pd.DataFrame: Copy of ``df`` with two added columns: 'org_renamed'
        (the resolved name) and 'concept_count' (1-based running number of
        each row within its 'org_renamed' group, in row order).

    Raises:
        KeyError: If ``df`` has no 'organization' column.
    """
    # Standard name -> lowercase fragments that identify it.
    org_variations = {
        'Adventist Development Relief Agency': ['adventist development'],
        'Asian Development Bank': ['asian development bank'],
        # NOTE(review): the fragment below starts with a space, as in the
        # original data; confirm that this is intentional.
        'Association of the Regional Mechanism for Emissions Reductions of Boyacá, Colombia (MRRE)': [' regional mechanism for emissions reductions of boyacá'],
        'BioCarbon Partners (BCP)': ['biocarbon partners'],
        'Biothermica Technologies Inc': ['biothermica tech'],
        'Brazilian Tourist Board': ['brazilian tourist board'],
        'Caribbean Community Climate Change Centre': ['caribbean community climate'],
        'Caritas': ['caritas'],
        'Climate Advocacy International (CAI)': ['climate advocacy int'],
        'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['deutsche gesellschaft für internationale'],
        'Deutsche Sparkassenstiftung (DSIK)': ['deutsche sparkassenstiftung'],
        'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['east african centre of excellence for renewable'],
        'Eco-Ideal': ['eco-ideal'],
        'Global Green Growth Institute (GGGI)': ['global green growth'],
        'Inter-American Development Bank (IDB)': ['american development bank'],
        'Iskandar Regional Development Authority (IRDA)': ['iskandar regional'],
        'Islamic Development Bank': ['islamic development bank'],
        'Malaysian Industry Government Group for High Technology (MIGHT)': ['government group for high technology'],
        'Osh Technological University': ['osh technological university','ошский технологический университет'],
        'Oxford Policy Management (OPM)': ['oxford policy management'],
        'Pacific Rim Investment Management': ['pacific rim investment'],
        'Palestinian Energy and Natural Resources Authority (PENRA)': ['palestinian energy and natural'],
        'Secretariat of the Pacific Regional Environment Programme (SPREP)': ['secretariat of the pacific regional environment programme (sprep)'],
        'Serviço Nacional de Aprendizagem Industrial (SENAI)': ['serviço nacional de aprendizagem'],
        'Sumy City Council': ['sumy city council'],
        'Uganda Development Bank Limited (UDBL)': ['uganda development bank'],
        'UN-Habitat': ['united nations human settlement','un-habitat'],
        'United Nations Children\'s Fund (UNICEF)': ['united nations children'],
        'United Nations Conference on Trade and Development (UNCTAD)': ['united nations conference on trade'],
        'United Nations Development Programme (UNDP)': ['united nations development program'],
        'United Nations Economic and Social Commission (ECOSOC)': ['united nations economic and social'],
        'United Nations Environment Programme (UNEP)': ['united nations environment'],
        'United Nations Industrial Development Organization (UNIDO)': ['united nations industrial'],
        'United Nations Office for Project Services (UNOPS)': ['united nations office for project'],
        'World Food Programme (WFP)': ['world food program'],
        'World Resources Institute (WRI)': ['world resources institute'],
        'World Wide Fund for Nature (WWF)': ['world wildlife','world wide fund for nature'],
    }

    # Standard name -> case-sensitive abbreviations that identify it.
    org_abbreviations = {
        'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['GIZ'],
        'Deutsche Sparkassenstiftung (DSIK)': ['DSIK'],
        'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['EACREEE'],
        'Global Green Growth Institute (GGGI)': ['GGGI'],
        'UN-Habitat': ['UN-Habitat'],
        'United Nations Children\'s Fund (UNICEF)': ['UNICEF'],
        'United Nations Conference on Trade and Development (UNCTAD)': ['UNCTAD'],
        'United Nations Development Programme (UNDP)': ['UNDP'],
        'United Nations Economic and Social Commission (ECOSOC)': ['ECOSOC'],
        'United Nations Environment Programme (UNEP)': ['UNEP'],
        'United Nations Industrial Development Organization (UNIDO)': ['UNIDO'],
        'United Nations Office for Project Services (UNOPS)': ['UNOPS'],
        'World Food Programme (WFP)': ['WFP'],
        'World Resources Institute (WRI)': ['WRI'],
        'World Wide Fund for Nature (WWF)': ['WWF']
    }

    # Resolve each distinct string only once; repeated names reuse the result.
    resolved = {}
    renamed = []
    for value in df['organization'].tolist():
        if not isinstance(value, str):
            # Missing or non-text entries cannot be matched; keep them as-is.
            renamed.append(value)
            continue
        if value not in resolved:
            resolved[value] = _standardize_one(
                value, org_variations, org_abbreviations, fuzzy_threshold
            )
        renamed.append(resolved[value])

    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()
    # Values are assigned positionally (the Series reuses df's own index), so
    # frames with duplicate index labels are still handled row by row.
    df['org_renamed'] = pd.Series(renamed, index=df.index, dtype=object)

    # Running occurrence number of each standardized name, starting at 1.
    df['concept_count'] = df.groupby('org_renamed').cumcount() + 1

    return df


def _standardize_one(name, org_variations, org_abbreviations, fuzzy_threshold):
    """Return the standard name for one organization string, or the string itself."""
    lowered = name.lower()
    match = None

    # When several entries match, the one listed last in the dictionary wins.
    for standard_name, fragments in org_variations.items():
        if any(fragment in lowered for fragment in fragments):
            match = standard_name

    if match is None:
        for standard_name, abbreviations in org_abbreviations.items():
            if any(_contains_whole_token(name, abbreviation) for abbreviation in abbreviations):
                match = standard_name

    if match is None:
        match = _best_fuzzy_match(lowered, org_variations, fuzzy_threshold)

    return name if match is None else match


def _contains_whole_token(text, token):
    """Return True if `token` occurs in `text` without a letter or digit on either side."""
    start = text.find(token)
    while start != -1:
        end = start + len(token)
        clear_before = start == 0 or not text[start - 1].isalnum()
        clear_after = end == len(text) or not text[end].isalnum()
        if clear_before and clear_after:
            return True
        start = text.find(token, start + 1)
    return False


def _best_fuzzy_match(lowered_name, org_variations, threshold):
    """Return the standard name whose forms score highest above `threshold`, else None."""
    best_match = None
    highest_ratio = threshold  # only ratios strictly above the threshold count
    for standard_name, fragments in org_variations.items():
        for variant in [standard_name.lower()] + fragments:
            ratio = fuzz.ratio(lowered_name, variant)
            # Strict comparison keeps the first of several equally good matches.
            if ratio > highest_ratio:
                highest_ratio = ratio
                best_match = standard_name
    return best_match
|
121 |
+
|
122 |
+
# Example usage:
|
123 |
+
if __name__ == "__main__":
    # Script entry point: read the organizations CSV from the sandbox folder
    # and run it through the standardization routine.
    df = standardize_organization_names(pd.read_csv('sandbox/maf_orgs.csv'))
|