angusfung committed on
Commit
c732f23
·
verified ·
1 Parent(s): 7e4ff82

Update src/ProcessOneSingleCampaign.py

Browse files
Files changed (1) hide show
  1. src/ProcessOneSingleCampaign.py +131 -4
src/ProcessOneSingleCampaign.py CHANGED
@@ -53,6 +53,88 @@ class CampaignProcessor:
53
  self.categories = sorted(list(set(camp.get('raw_category', '') for camp in self.data)))
54
  self.lazy_load = lazy_load
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  # Initialize model variables (to be loaded later)
57
  self.tokenizer = None # Longformer tokenizer for descriptions
58
  self.model = None # Longformer model for descriptions
@@ -365,24 +447,69 @@ class CampaignProcessor:
365
  print(f"Error processing subcategory: {str(e)}")
366
  return np.zeros(100)
367
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  def process_country_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
369
  """
370
  Process the project country to generate a GloVe embedding.
371
 
 
 
 
 
 
372
  Args:
373
  campaign (Dict): Campaign data
374
  idx (int): Index of the campaign
375
 
376
  Returns:
377
- np.ndarray: GloVe embedding of the country
378
  """
379
  self._ensure_models_loaded()
380
 
381
  try:
382
- country = campaign.get('raw_country', '')
383
- return self._get_glove_embedding(country)
 
 
 
 
 
 
 
384
  except Exception as e:
385
- print(f"Error processing country: {str(e)}")
386
  return np.zeros(100)
387
 
388
  def process_funding_goal(self, campaign: Dict, idx: int) -> float:
 
53
  self.categories = sorted(list(set(camp.get('raw_category', '') for camp in self.data)))
54
  self.lazy_load = lazy_load
55
 
56
+ # Country name to ISO alpha-2 code mapping
57
+ # First letter of each word is capitalized in the original data
58
+ self.country_to_alpha2 = {
59
+ # Main ISO country names
60
+ "United States": "US",
61
+ "United Kingdom": "GB",
62
+ "Canada": "CA",
63
+ "Australia": "AU",
64
+ "New Zealand": "NZ",
65
+ "Germany": "DE",
66
+ "France": "FR",
67
+ "Italy": "IT",
68
+ "Spain": "ES",
69
+ "Netherlands": "NL",
70
+ "Sweden": "SE",
71
+ "Denmark": "DK",
72
+ "Norway": "NO",
73
+ "Ireland": "IE",
74
+ "Switzerland": "CH",
75
+ "Austria": "AT",
76
+ "Belgium": "BE",
77
+ "Luxembourg": "LU",
78
+ "Hong Kong": "HK",
79
+ "Singapore": "SG",
80
+ "Mexico": "MX",
81
+ "Japan": "JP",
82
+ "China": "CN",
83
+ "Brazil": "BR",
84
+ "India": "IN",
85
+ "South Korea": "KR",
86
+ "South Africa": "ZA",
87
+ "Argentina": "AR",
88
+ "Poland": "PL",
89
+ "Portugal": "PT",
90
+ "Russia": "RU",
91
+ "Greece": "GR",
92
+ "Czech Republic": "CZ",
93
+ "Finland": "FI",
94
+ "Hungary": "HU",
95
+ "Romania": "RO",
96
+ "Thailand": "TH",
97
+ "Turkey": "TR",
98
+ "Ukraine": "UA",
99
+ "Colombia": "CO",
100
+ "Chile": "CL",
101
+ "Peru": "PE",
102
+ "Malaysia": "MY",
103
+ "Vietnam": "VN",
104
+ "Indonesia": "ID",
105
+ "Philippines": "PH",
106
+ "United Arab Emirates": "AE",
107
+ "Saudi Arabia": "SA",
108
+ "Israel": "IL",
109
+ "Egypt": "EG",
110
+ "Nigeria": "NG",
111
+ "Kenya": "KE",
112
+
113
+ # Common variants and abbreviations
114
+ "USA": "US",
115
+ "U.S.A.": "US",
116
+ "U.S.": "US",
117
+ "UK": "GB",
118
+ "U.K.": "GB",
119
+ "Great Britain": "GB",
120
+ "England": "GB",
121
+ "Republic Of Korea": "KR",
122
+ "Korea": "KR",
123
+ "Republic Of China": "CN",
124
+ "Republic Of India": "IN",
125
+ "UAE": "AE",
126
+ "Russia": "RU",
127
+ "Russian Federation": "RU",
128
+ "The Netherlands": "NL",
129
+ "Holland": "NL",
130
+ "Republic Of Ireland": "IE",
131
+ "Czech": "CZ",
132
+ "Czechia": "CZ",
133
+ }
134
+
135
+ # Create a lowercase version of the dictionary for case-insensitive lookups
136
+ self.country_to_alpha2_lower = {k.lower(): v for k, v in self.country_to_alpha2.items()}
137
+
138
  # Initialize model variables (to be loaded later)
139
  self.tokenizer = None # Longformer tokenizer for descriptions
140
  self.model = None # Longformer model for descriptions
 
447
  print(f"Error processing subcategory: {str(e)}")
448
  return np.zeros(100)
449
 
450
+ def _convert_country_to_alpha2(self, country_name: str) -> str:
451
+ """
452
+ Convert a country name to its ISO alpha-2 code.
453
+
454
+ This helper method handles the conversion with proper logging:
455
+ 1. Tries exact match first
456
+ 2. Falls back to case-insensitive match
457
+ 3. Returns original string if no match found
458
+
459
+ Args:
460
+ country_name (str): Country name to convert
461
+
462
+ Returns:
463
+ str: ISO alpha-2 code (e.g., "US") or original country name if no match
464
+ """
465
+ if not country_name:
466
+ return ""
467
+
468
+ # Try exact match first
469
+ alpha2_code = self.country_to_alpha2.get(country_name)
470
+
471
+ # If no exact match, try case-insensitive match
472
+ if not alpha2_code:
473
+ alpha2_code = self.country_to_alpha2_lower.get(country_name.lower())
474
+
475
+ # Log results
476
+ if alpha2_code:
477
+ print(f"Country conversion: '{country_name}' → '{alpha2_code}'")
478
+ return alpha2_code
479
+ else:
480
+ print(f"Country conversion failed: '{country_name}' not found in dictionary")
481
+ return country_name
482
+
483
  def process_country_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
484
  """
485
  Process the project country to generate a GloVe embedding.
486
 
487
+ This method:
488
+ 1. Extracts the country name from campaign data
489
+ 2. Converts full country name to ISO alpha-2 code (e.g., "United States" → "US")
490
+ 3. Generates an embedding using GloVe for the standardized country code
491
+
492
  Args:
493
  campaign (Dict): Campaign data
494
  idx (int): Index of the campaign
495
 
496
  Returns:
497
+ np.ndarray: GloVe embedding of the country (as alpha-2 code)
498
  """
499
  self._ensure_models_loaded()
500
 
501
  try:
502
+ # Extract country name from campaign data
503
+ country_name = campaign.get('raw_country', '')
504
+
505
+ # Convert to alpha-2 code using helper method
506
+ alpha2_code = self._convert_country_to_alpha2(country_name)
507
+
508
+ # Generate embedding using standardized country code
509
+ return self._get_glove_embedding(alpha2_code)
510
+
511
  except Exception as e:
512
+ print(f"Error processing country for campaign {idx}: {str(e)}")
513
  return np.zeros(100)
514
 
515
  def process_funding_goal(self, campaign: Dict, idx: int) -> float: