mtyrrell committed on
Commit
ab78519
·
1 Parent(s): c8a9cbc

bug fixing

Browse files
Files changed (1) hide show
  1. modules/org_count.py +125 -0
modules/org_count.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from thefuzz import fuzz
3
+
4
+
5
# Canonical organization name -> lowercase substrings; a name is mapped to the
# canonical form when its lowercased text contains any of these substrings.
_ORG_VARIATIONS = {
    'Adventist Development Relief Agency': ['adventist development'],
    'Asian Development Bank': ['asian development bank'],
    'Association of the Regional Mechanism for Emissions Reductions of Boyacá, Colombia (MRRE)': [' regional mechanism for emissions reductions of boyacá'],
    'BioCarbon Partners (BCP)': ['biocarbon partners'],
    'Biothermica Technologies Inc': ['biothermica tech'],
    'Brazilian Tourist Board': ['brazilian tourist board'],
    'Caribbean Community Climate Change Centre': ['caribbean community climate'],
    'Caritas': ['caritas'],
    'Climate Advocacy International (CAI)': ['climate advocacy int'],
    'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['deutsche gesellschaft für internationale'],
    'Deutsche Sparkassenstiftung (DSIK)': ['deutsche sparkassenstiftung'],
    'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['east african centre of excellence for renewable'],
    'Eco-Ideal': ['eco-ideal'],
    'Global Green Growth Institute (GGGI)': ['global green growth'],
    'Inter-American Development Bank (IDB)': ['american development bank'],
    'Iskandar Regional Development Authority (IRDA)': ['iskandar regional'],
    'Islamic Development Bank': ['islamic development bank'],
    'Malaysian Industry Government Group for High Technology (MIGHT)': ['government group for high technology'],
    'Osh Technological University': ['osh technological university', 'ошский технологический университет'],
    'Oxford Policy Management (OPM)': ['oxford policy management'],
    'Pacific Rim Investment Management': ['pacific rim investment'],
    'Palestinian Energy and Natural Resources Authority (PENRA)': ['palestinian energy and natural'],
    'Secretariat of the Pacific Regional Environment Programme (SPREP)': ['secretariat of the pacific regional environment programme (sprep)'],
    'Serviço Nacional de Aprendizagem Industrial (SENAI)': ['serviço nacional de aprendizagem'],
    'Sumy City Council': ['sumy city council'],
    'Uganda Development Bank Limited (UDBL)': ['uganda development bank'],
    'UN-Habitat': ['united nations human settlement', 'un-habitat'],
    'United Nations Children\'s Fund (UNICEF)': ['united nations children'],
    'United Nations Conference on Trade and Development (UNCTAD)': ['united nations conference on trade'],
    'United Nations Development Programme (UNDP)': ['united nations development program'],
    'United Nations Economic and Social Commission (ECOSOC)': ['united nations economic and social'],
    'United Nations Environment Programme (UNEP)': ['united nations environment'],
    'United Nations Industrial Development Organization (UNIDO)': ['united nations industrial'],
    'United Nations Office for Project Services (UNOPS)': ['united nations office for project'],
    'World Food Programme (WFP)': ['world food program'],
    'World Resources Institute (WRI)': ['world resources institute'],
    'World Wide Fund for Nature (WWF)': ['world wildlife', 'world wide fund for nature'],
}

# Canonical organization name -> abbreviations, matched case-sensitively
# against the original (non-lowercased) text.
# NOTE(review): matching is plain substring containment, so e.g. a name
# containing "GIZA" also contains "GIZ" -- confirm whether word boundaries
# are wanted before tightening this.
_ORG_ABBREVIATIONS = {
    'Deutsche Gesellschaft für Internationale Zusammenarbeit (GIZ)': ['GIZ'],
    'Deutsche Sparkassenstiftung (DSIK)': ['DSIK'],
    'East African Centre of Excellence for Renewable Energy and Efficiency (EACREEE)': ['EACREEE'],
    'Global Green Growth Institute (GGGI)': ['GGGI'],
    'UN-Habitat': ['UN-Habitat'],
    'United Nations Children\'s Fund (UNICEF)': ['UNICEF'],
    'United Nations Conference on Trade and Development (UNCTAD)': ['UNCTAD'],
    'United Nations Development Programme (UNDP)': ['UNDP'],
    'United Nations Economic and Social Commission (ECOSOC)': ['ECOSOC'],
    'United Nations Environment Programme (UNEP)': ['UNEP'],
    'United Nations Industrial Development Organization (UNIDO)': ['UNIDO'],
    'United Nations Office for Project Services (UNOPS)': ['UNOPS'],
    'World Food Programme (WFP)': ['WFP'],
    'World Resources Institute (WRI)': ['WRI'],
    'World Wide Fund for Nature (WWF)': ['WWF'],
}

# A fuzzy candidate is accepted only when its ratio is strictly above this.
_FUZZY_THRESHOLD = 90


def _last_containing(text, table):
    """Return the last canonical name in *table* with a needle inside *text*.

    Every entry is checked and later entries override earlier ones, so when
    several canonical names match, the one defined last in *table* wins.
    Returns None when nothing matches.
    """
    found = None
    for standard_name, needles in table.items():
        if any(needle in text for needle in needles):
            found = standard_name
    return found


def _best_fuzzy_match(lowered_name):
    """Return the canonical name most similar to *lowered_name*, or None.

    Each canonical name is compared through its own lowercased form and all of
    its variation substrings. Only ratios strictly above _FUZZY_THRESHOLD
    count, and on equal ratios the candidate encountered first is kept.
    """
    best_name = None
    best_ratio = _FUZZY_THRESHOLD
    for standard_name, variations in _ORG_VARIATIONS.items():
        for variant in [standard_name.lower()] + variations:
            ratio = fuzz.ratio(lowered_name, variant)
            if ratio > best_ratio:
                best_ratio = ratio
                best_name = standard_name
    return best_name


def standardize_organization_names(df):
    """
    Standardizes organization names in a DataFrame using exact matches, abbreviations, and fuzzy matching.

    Each value of the 'organization' column is resolved in this order:

    1. substring match of the lowercased name against the known variations;
    2. case-sensitive substring match against the known abbreviations;
    3. fuzzy match (thefuzz ``fuzz.ratio`` strictly above 90);
    4. otherwise the original value is kept unchanged.

    Missing values are never matched and stay missing. Non-string values only
    take part in step 3, using their ``str()`` form.

    The result is built positionally, so frames with a non-unique index are
    handled row by row, and no scratch columns are added to the frame, so
    existing columns named 'check_name' or 'check_abreviation' survive.

    Args:
        df (pd.DataFrame): DataFrame containing an 'organization' column

    Returns:
        pd.DataFrame: Copy of the input with added 'org_renamed' and
            'concept_count' columns. 'concept_count' is the 1-based running
            occurrence number of each row within its 'org_renamed' group,
            in row order.

    Raises:
        KeyError: If the DataFrame has no 'organization' column.
    """
    # Make a copy to avoid modifying the original DataFrame
    df = df.copy()

    organizations = df['organization']
    fuzzy_cache = {}  # lowered name -> fuzzy result, so duplicates are scored once
    renamed = []

    for value, is_present in zip(organizations, organizations.notna()):
        match = None
        if isinstance(value, str):
            # Variation matches take priority over abbreviation matches.
            match = (_last_containing(value.lower(), _ORG_VARIATIONS)
                     or _last_containing(value, _ORG_ABBREVIATIONS))

        if match is None and is_present:
            lowered = str(value).lower()
            if lowered not in fuzzy_cache:
                fuzzy_cache[lowered] = _best_fuzzy_match(lowered)
            match = fuzzy_cache[lowered]

        # Fall back to the original value when nothing matched.
        renamed.append(value if match is None else match)

    # Same index as df, so pandas assigns the values without label alignment.
    df['org_renamed'] = pd.Series(renamed, index=df.index, dtype=object)

    # Add concept count
    df['concept_count'] = df.groupby('org_renamed').cumcount() + 1

    return df
121
+
122
# Example usage: load the sandbox organization list and standardize it.
if __name__ == "__main__":
    df = standardize_organization_names(pd.read_csv('sandbox/maf_orgs.csv'))