Update src/ProcessOneSingleCampaign.py
Browse files- src/ProcessOneSingleCampaign.py +131 -4
src/ProcessOneSingleCampaign.py
CHANGED
@@ -53,6 +53,88 @@ class CampaignProcessor:
|
|
53 |
self.categories = sorted(list(set(camp.get('raw_category', '') for camp in self.data)))
|
54 |
self.lazy_load = lazy_load
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
# Initialize model variables (to be loaded later)
|
57 |
self.tokenizer = None # Longformer tokenizer for descriptions
|
58 |
self.model = None # Longformer model for descriptions
|
@@ -365,24 +447,69 @@ class CampaignProcessor:
|
|
365 |
print(f"Error processing subcategory: {str(e)}")
|
366 |
return np.zeros(100)
|
367 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
def process_country_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
|
369 |
"""
|
370 |
Process the project country to generate a GloVe embedding.
|
371 |
|
|
|
|
|
|
|
|
|
|
|
372 |
Args:
|
373 |
campaign (Dict): Campaign data
|
374 |
idx (int): Index of the campaign
|
375 |
|
376 |
Returns:
|
377 |
-
np.ndarray: GloVe embedding of the country
|
378 |
"""
|
379 |
self._ensure_models_loaded()
|
380 |
|
381 |
try:
|
382 |
-
country
|
383 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
384 |
except Exception as e:
|
385 |
-
print(f"Error processing country: {str(e)}")
|
386 |
return np.zeros(100)
|
387 |
|
388 |
def process_funding_goal(self, campaign: Dict, idx: int) -> float:
|
|
|
53 |
self.categories = sorted(list(set(camp.get('raw_category', '') for camp in self.data)))
|
54 |
self.lazy_load = lazy_load
|
55 |
|
56 |
+
# Country name to ISO alpha-2 code mapping
|
57 |
+
# First letter of each word is capitalized in the original data
|
58 |
+
self.country_to_alpha2 = {
|
59 |
+
# Main ISO country names
|
60 |
+
"United States": "US",
|
61 |
+
"United Kingdom": "GB",
|
62 |
+
"Canada": "CA",
|
63 |
+
"Australia": "AU",
|
64 |
+
"New Zealand": "NZ",
|
65 |
+
"Germany": "DE",
|
66 |
+
"France": "FR",
|
67 |
+
"Italy": "IT",
|
68 |
+
"Spain": "ES",
|
69 |
+
"Netherlands": "NL",
|
70 |
+
"Sweden": "SE",
|
71 |
+
"Denmark": "DK",
|
72 |
+
"Norway": "NO",
|
73 |
+
"Ireland": "IE",
|
74 |
+
"Switzerland": "CH",
|
75 |
+
"Austria": "AT",
|
76 |
+
"Belgium": "BE",
|
77 |
+
"Luxembourg": "LU",
|
78 |
+
"Hong Kong": "HK",
|
79 |
+
"Singapore": "SG",
|
80 |
+
"Mexico": "MX",
|
81 |
+
"Japan": "JP",
|
82 |
+
"China": "CN",
|
83 |
+
"Brazil": "BR",
|
84 |
+
"India": "IN",
|
85 |
+
"South Korea": "KR",
|
86 |
+
"South Africa": "ZA",
|
87 |
+
"Argentina": "AR",
|
88 |
+
"Poland": "PL",
|
89 |
+
"Portugal": "PT",
|
90 |
+
"Russia": "RU",
|
91 |
+
"Greece": "GR",
|
92 |
+
"Czech Republic": "CZ",
|
93 |
+
"Finland": "FI",
|
94 |
+
"Hungary": "HU",
|
95 |
+
"Romania": "RO",
|
96 |
+
"Thailand": "TH",
|
97 |
+
"Turkey": "TR",
|
98 |
+
"Ukraine": "UA",
|
99 |
+
"Colombia": "CO",
|
100 |
+
"Chile": "CL",
|
101 |
+
"Peru": "PE",
|
102 |
+
"Malaysia": "MY",
|
103 |
+
"Vietnam": "VN",
|
104 |
+
"Indonesia": "ID",
|
105 |
+
"Philippines": "PH",
|
106 |
+
"United Arab Emirates": "AE",
|
107 |
+
"Saudi Arabia": "SA",
|
108 |
+
"Israel": "IL",
|
109 |
+
"Egypt": "EG",
|
110 |
+
"Nigeria": "NG",
|
111 |
+
"Kenya": "KE",
|
112 |
+
|
113 |
+
# Common variants and abbreviations
|
114 |
+
"USA": "US",
|
115 |
+
"U.S.A.": "US",
|
116 |
+
"U.S.": "US",
|
117 |
+
"UK": "GB",
|
118 |
+
"U.K.": "GB",
|
119 |
+
"Great Britain": "GB",
|
120 |
+
"England": "GB",
|
121 |
+
"Republic Of Korea": "KR",
|
122 |
+
"Korea": "KR",
|
123 |
+
"Republic Of China": "CN",
|
124 |
+
"Republic Of India": "IN",
|
125 |
+
"UAE": "AE",
|
126 |
+
"Russia": "RU",
|
127 |
+
"Russian Federation": "RU",
|
128 |
+
"The Netherlands": "NL",
|
129 |
+
"Holland": "NL",
|
130 |
+
"Republic Of Ireland": "IE",
|
131 |
+
"Czech": "CZ",
|
132 |
+
"Czechia": "CZ",
|
133 |
+
}
|
134 |
+
|
135 |
+
# Create a lowercase version of the dictionary for case-insensitive lookups
|
136 |
+
self.country_to_alpha2_lower = {k.lower(): v for k, v in self.country_to_alpha2.items()}
|
137 |
+
|
138 |
# Initialize model variables (to be loaded later)
|
139 |
self.tokenizer = None # Longformer tokenizer for descriptions
|
140 |
self.model = None # Longformer model for descriptions
|
|
|
447 |
print(f"Error processing subcategory: {str(e)}")
|
448 |
return np.zeros(100)
|
449 |
|
450 |
+
def _convert_country_to_alpha2(self, country_name: str) -> str:
    """
    Convert a country name to its ISO alpha-2 code.

    Lookup order:
    1. Exact match against ``self.country_to_alpha2``
    2. Case-insensitive match against ``self.country_to_alpha2_lower``
    3. No match: the whitespace-normalized name is returned unchanged

    Leading/trailing whitespace is removed and internal runs of whitespace
    are collapsed to a single space before the lookup, so inputs such as
    ``" United States "`` or ``"United  States"`` still resolve.

    Args:
        country_name (str): Country name to convert

    Returns:
        str: ISO alpha-2 code (e.g., "US"); the whitespace-normalized
            country name if no match is found; "" for empty, None, or
            whitespace-only input

    Raises:
        AttributeError: If ``country_name`` is a truthy non-string value
            (it has no ``split`` method).
    """
    if not country_name:
        return ""

    # Normalize whitespace so padded or double-spaced names still match.
    normalized = " ".join(country_name.split())
    if not normalized:
        # Whitespace-only input carries no country information.
        return ""

    # Try exact match first, then fall back to the case-insensitive table.
    alpha2_code = self.country_to_alpha2.get(normalized)
    if not alpha2_code:
        alpha2_code = self.country_to_alpha2_lower.get(normalized.lower())

    # Log results (messages show the caller's original input).
    if alpha2_code:
        print(f"Country conversion: '{country_name}' → '{alpha2_code}'")
        return alpha2_code

    print(f"Country conversion failed: '{country_name}' not found in dictionary")
    return normalized
|
482 |
+
|
483 |
def process_country_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
    """
    Build the embedding vector for a campaign's country.

    Steps:
    1. Make sure the models are available via ``_ensure_models_loaded``
    2. Read ``'raw_country'`` from the campaign (empty string if absent)
    3. Map the name to an alpha-2 code with ``_convert_country_to_alpha2``
    4. Pass that code to ``_get_glove_embedding`` and return its result

    Args:
        campaign (Dict): Campaign data
        idx (int): Index of the campaign (used only in the error message)

    Returns:
        np.ndarray: Result of ``_get_glove_embedding`` for the country code,
            or a zero vector of length 100 if any step inside the guarded
            section raises
    """
    self._ensure_models_loaded()

    try:
        # Name -> alpha-2 code -> embedding, chained in a single expression.
        embedding = self._get_glove_embedding(
            self._convert_country_to_alpha2(campaign.get('raw_country', ''))
        )
    except Exception as exc:
        # Best-effort: report the failure and fall back to a zero vector.
        print(f"Error processing country for campaign {idx}: {str(exc)}")
        embedding = np.zeros(100)

    return embedding
|
514 |
|
515 |
def process_funding_goal(self, campaign: Dict, idx: int) -> float:
|