angusfung committed on
Commit
7e4ff82
·
verified ·
1 Parent(s): a84b543

updated numerical feature engineering

Browse files
Files changed (1) hide show
  1. src/ProcessOneSingleCampaign.py +289 -41
src/ProcessOneSingleCampaign.py CHANGED
@@ -1,3 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  # Set gensim data directory to a writable location at the very start
3
  os.environ['GENSIM_DATA_DIR'] = '/tmp/gensim-data'
@@ -9,29 +25,57 @@ except Exception as e:
9
 
10
  import json
11
  import numpy as np
12
- from typing import Dict
13
  import torch
14
  from transformers import AutoTokenizer, AutoModel
15
  import gc
16
  import gensim.downloader
17
 
18
  class CampaignProcessor:
19
- def __init__(self, data, lazy_load=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  self.data = data
21
  self.categories = sorted(list(set(camp.get('raw_category', '') for camp in self.data)))
22
  self.lazy_load = lazy_load
23
 
24
- self.tokenizer = None
25
- self.model = None
26
- self.RiskandBlurb_tokenizer = None
27
- self.RiskandBlurb_model = None
28
- self.glove = None
 
29
  self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
30
 
 
31
  if not lazy_load:
32
  self._load_models()
33
 
34
  def _load_models(self):
 
 
 
 
 
 
 
 
 
 
35
  print("Loading NLP models...")
36
  # Cache models locally to avoid downloading every time
37
  cache_dir = "/tmp/model_cache"
@@ -120,33 +164,60 @@ class CampaignProcessor:
120
  raise e
121
 
122
  def _ensure_models_loaded(self):
 
 
 
 
 
 
123
  if self.model is None or self.tokenizer is None or self.RiskandBlurb_model is None or self.RiskandBlurb_tokenizer is None or self.glove is None:
124
  self._load_models()
125
 
126
- def _process_text_embedding(self, text, max_length, tokenizer, model):
127
- # Common function for text embedding generation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  if self.device.type == 'cuda':
129
  torch.cuda.empty_cache()
130
  gc.collect()
131
 
 
132
  inputs = tokenizer(text,
133
  padding=True,
134
  truncation=True,
135
  max_length=max_length,
136
  return_tensors="pt")
137
 
 
138
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
139
 
 
140
  with torch.no_grad():
141
  outputs = model(**inputs)
142
 
 
143
  attention_mask = inputs['attention_mask']
144
  token_embeddings = outputs.last_hidden_state
145
  input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
146
  sentence_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
147
 
 
148
  embedding = sentence_embeddings.cpu().numpy()
149
 
 
150
  del inputs, outputs, token_embeddings, sentence_embeddings
151
  if self.device.type == 'cuda':
152
  torch.cuda.empty_cache()
@@ -154,8 +225,17 @@ class CampaignProcessor:
154
 
155
  return embedding[0]
156
 
157
- def _get_glove_embedding(self, text, dim=100):
158
- # Common function for GloVe embeddings (subcategory and country)
 
 
 
 
 
 
 
 
 
159
  if not text:
160
  return np.zeros(dim)
161
 
@@ -164,16 +244,30 @@ class CampaignProcessor:
164
  words = text.split()
165
  vectors = []
166
 
 
167
  for word in words:
168
  if word in self.glove:
169
  vectors.append(self.glove[word])
170
 
 
171
  if vectors:
172
  return np.mean(vectors, axis=0)
173
  else:
174
  return np.zeros(dim)
175
 
176
- def process_description_embedding(self, campaign: Dict, idx: int):
 
 
 
 
 
 
 
 
 
 
 
 
177
  self._ensure_models_loaded()
178
 
179
  try:
@@ -185,7 +279,17 @@ class CampaignProcessor:
185
  print(f"Error processing description: {str(e)}")
186
  return np.zeros(768), 0
187
 
188
- def process_riskandchallenges_embedding(self, campaign: Dict, idx: int):
 
 
 
 
 
 
 
 
 
 
189
  self._ensure_models_loaded()
190
 
191
  try:
@@ -195,7 +299,17 @@ class CampaignProcessor:
195
  print(f"Error processing risk statement: {str(e)}")
196
  return np.zeros(384)
197
 
198
- def process_blurb(self, campaign: Dict, idx: int):
 
 
 
 
 
 
 
 
 
 
199
  self._ensure_models_loaded()
200
 
201
  try:
@@ -205,7 +319,16 @@ class CampaignProcessor:
205
  print(f"Error processing blurb: {str(e)}")
206
  return np.zeros(384)
207
 
208
- def process_category(self, campaign: Dict):
 
 
 
 
 
 
 
 
 
209
  try:
210
  # All categories in the dataset
211
  fixed_categories = [
@@ -222,7 +345,17 @@ class CampaignProcessor:
222
  print(f"Error processing category: {str(e)}")
223
  return [0] * 15
224
 
225
- def process_subcategory_embedding(self, campaign: Dict, idx: int):
 
 
 
 
 
 
 
 
 
 
226
  self._ensure_models_loaded()
227
 
228
  try:
@@ -232,7 +365,17 @@ class CampaignProcessor:
232
  print(f"Error processing subcategory: {str(e)}")
233
  return np.zeros(100)
234
 
235
- def process_country_embedding(self, campaign: Dict, idx: int):
 
 
 
 
 
 
 
 
 
 
236
  self._ensure_models_loaded()
237
 
238
  try:
@@ -242,19 +385,131 @@ class CampaignProcessor:
242
  print(f"Error processing country: {str(e)}")
243
  return np.zeros(100)
244
 
245
- def process_funding_goal(self, campaign: Dict, idx: int):
246
- return float(campaign.get('funding_goal', 0))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
- def process_previous_funding_goal(self, campaign: Dict, idx: int):
249
- return float(campaign.get('previous_funding_goal', 0))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
- def process_previous_pledged(self, campaign: Dict, idx: int):
252
- return float(campaign.get('previous_pledged', 0))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
- def calculate_previous_sucess_rate(self, campaign: Dict, idx: int):
255
- return float(campaign.get('previous_success_rate', 0))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
- def process_campaign(self, campaign: Dict, idx: int):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  self._ensure_models_loaded()
259
 
260
  # Generate embeddings for text fields
@@ -274,22 +529,15 @@ class CampaignProcessor:
274
  'country_embedding': self.process_country_embedding(campaign, idx).tolist()
275
  }
276
 
277
- # Process numerical features or use existing values from input
278
- numerical_fields = [
279
- ('funding_goal', self.process_funding_goal),
280
- ('previous_funding_goal', self.process_previous_funding_goal),
281
- ('previous_pledged', self.process_previous_pledged),
282
- ('previous_success_rate', self.calculate_previous_sucess_rate)
283
- ]
284
-
285
- # Process numerical features or use values from input
286
- for field_name, processor_func in numerical_fields:
287
- if field_name in campaign:
288
- result[field_name] = campaign[field_name]
289
- else:
290
- result[field_name] = processor_func(campaign, idx)
291
 
292
- # Simple integer fields
293
  for field in ['image_count', 'video_count', 'campaign_duration', 'previous_projects_count']:
294
  result[field] = int(campaign.get(field, 0))
295
 
 
1
+ """
2
+ Campaign Data Processor for Kickstarter Prediction
3
+
4
+ This module handles the preprocessing of raw Kickstarter campaign data,
5
+ generating text embeddings and preparing numerical features for prediction.
6
+
7
+ Key functionality:
8
+ - Longformer embeddings for project descriptions
9
+ - Sentence transformer embeddings for blurbs and risk statements
10
+ - GloVe embeddings for subcategories and countries
11
+ - Normalization of numerical features
12
+
13
+ Author: Angus Fung
14
+ Date: April 2025
15
+ """
16
+
17
  import os
18
  # Set gensim data directory to a writable location at the very start
19
  os.environ['GENSIM_DATA_DIR'] = '/tmp/gensim-data'
 
25
 
26
  import json
27
  import numpy as np
28
+ from typing import Dict, List, Tuple, Any, Optional
29
  import torch
30
  from transformers import AutoTokenizer, AutoModel
31
  import gc
32
  import gensim.downloader
33
 
34
  class CampaignProcessor:
35
+ """
36
+ Processor for Kickstarter campaign data.
37
+
38
+ This class handles the preprocessing of raw campaign data, transforming
39
+ text and categorical features into embeddings using various NLP models
40
+ and preparing numerical features for the prediction model.
41
+ """
42
+
43
+ def __init__(self, data: List[Dict], lazy_load: bool = False):
44
+ """
45
+ Initialize the CampaignProcessor.
46
+
47
+ Args:
48
+ data (List[Dict]): List of campaign dictionaries to process
49
+ lazy_load (bool): If True, models will be loaded only when needed
50
+ rather than at initialization time
51
+ """
52
  self.data = data
53
  self.categories = sorted(list(set(camp.get('raw_category', '') for camp in self.data)))
54
  self.lazy_load = lazy_load
55
 
56
+ # Initialize model variables (to be loaded later)
57
+ self.tokenizer = None # Longformer tokenizer for descriptions
58
+ self.model = None # Longformer model for descriptions
59
+ self.RiskandBlurb_tokenizer = None # MiniLM tokenizer for blurb and risk
60
+ self.RiskandBlurb_model = None # MiniLM model for blurb and risk
61
+ self.glove = None # GloVe word vectors for categories and countries
62
  self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
63
 
64
+ # Load models at initialization if not using lazy loading
65
  if not lazy_load:
66
  self._load_models()
67
 
68
  def _load_models(self):
69
+ """
70
+ Load all NLP models required for processing campaign data.
71
+
72
+ This method loads:
73
+ - Longformer for description embeddings
74
+ - MiniLM for blurb and risk embeddings
75
+ - GloVe for subcategory and country embeddings
76
+
77
+ Models are cached to avoid reloading and moved to the appropriate device.
78
+ """
79
  print("Loading NLP models...")
80
  # Cache models locally to avoid downloading every time
81
  cache_dir = "/tmp/model_cache"
 
164
  raise e
165
 
166
  def _ensure_models_loaded(self):
167
+ """
168
+ Ensure all required models are loaded.
169
+
170
+ This is called before any processing to make sure models are ready,
171
+ particularly important when using lazy loading.
172
+ """
173
  if self.model is None or self.tokenizer is None or self.RiskandBlurb_model is None or self.RiskandBlurb_tokenizer is None or self.glove is None:
174
  self._load_models()
175
 
176
+ def _process_text_embedding(self, text: str, max_length: int, tokenizer: AutoTokenizer, model: AutoModel) -> np.ndarray:
177
+ """
178
+ Generate embedding for text using the specified model and tokenizer.
179
+
180
+ This method handles tokenization, model inference, and pooling to
181
+ create a single vector representation of the input text.
182
+
183
+ Args:
184
+ text (str): Text to embed
185
+ max_length (int): Maximum token length for the model
186
+ tokenizer (AutoTokenizer): Tokenizer to use
187
+ model (AutoModel): Model to use for embedding generation
188
+
189
+ Returns:
190
+ np.ndarray: Embedding vector for the text
191
+ """
192
+ # Clean up memory before processing
193
  if self.device.type == 'cuda':
194
  torch.cuda.empty_cache()
195
  gc.collect()
196
 
197
+ # Tokenize the text
198
  inputs = tokenizer(text,
199
  padding=True,
200
  truncation=True,
201
  max_length=max_length,
202
  return_tensors="pt")
203
 
204
+ # Move inputs to device
205
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
206
 
207
+ # Generate embeddings
208
  with torch.no_grad():
209
  outputs = model(**inputs)
210
 
211
+ # Mean pooling - attention-mask-weighted average of token embeddings (padding tokens are ignored)
212
  attention_mask = inputs['attention_mask']
213
  token_embeddings = outputs.last_hidden_state
214
  input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
215
  sentence_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
216
 
217
+ # Convert to numpy array
218
  embedding = sentence_embeddings.cpu().numpy()
219
 
220
+ # Clean up to prevent memory leaks
221
  del inputs, outputs, token_embeddings, sentence_embeddings
222
  if self.device.type == 'cuda':
223
  torch.cuda.empty_cache()
 
225
 
226
  return embedding[0]
227
 
228
+ def _get_glove_embedding(self, text: str, dim: int = 100) -> np.ndarray:
229
+ """
230
+ Generate GloVe embedding for a text by averaging word vectors.
231
+
232
+ Args:
233
+ text (str): Text to embed
234
+ dim (int): Dimension of the GloVe embeddings
235
+
236
+ Returns:
237
+ np.ndarray: Averaged GloVe embedding for the text
238
+ """
239
  if not text:
240
  return np.zeros(dim)
241
 
 
244
  words = text.split()
245
  vectors = []
246
 
247
+ # Collect vectors for words that exist in the vocabulary
248
  for word in words:
249
  if word in self.glove:
250
  vectors.append(self.glove[word])
251
 
252
+ # Average vectors if any exist, otherwise return zeros
253
  if vectors:
254
  return np.mean(vectors, axis=0)
255
  else:
256
  return np.zeros(dim)
257
 
258
+ def process_description_embedding(self, campaign: Dict, idx: int) -> Tuple[np.ndarray, int]:
259
+ """
260
+ Process the project description to generate a Longformer embedding.
261
+
262
+ Args:
263
+ campaign (Dict): Campaign data
264
+ idx (int): Index of the campaign
265
+
266
+ Returns:
267
+ Tuple containing:
268
+ - np.ndarray: Longformer embedding of the description
269
+ - int: Word count of the description
270
+ """
271
  self._ensure_models_loaded()
272
 
273
  try:
 
279
  print(f"Error processing description: {str(e)}")
280
  return np.zeros(768), 0
281
 
282
+ def process_riskandchallenges_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
283
+ """
284
+ Process the risks and challenges section to generate a MiniLM embedding.
285
+
286
+ Args:
287
+ campaign (Dict): Campaign data
288
+ idx (int): Index of the campaign
289
+
290
+ Returns:
291
+ np.ndarray: MiniLM embedding of the risks section
292
+ """
293
  self._ensure_models_loaded()
294
 
295
  try:
 
299
  print(f"Error processing risk statement: {str(e)}")
300
  return np.zeros(384)
301
 
302
+ def process_blurb(self, campaign: Dict, idx: int) -> np.ndarray:
303
+ """
304
+ Process the project blurb to generate a MiniLM embedding.
305
+
306
+ Args:
307
+ campaign (Dict): Campaign data
308
+ idx (int): Index of the campaign
309
+
310
+ Returns:
311
+ np.ndarray: MiniLM embedding of the blurb
312
+ """
313
  self._ensure_models_loaded()
314
 
315
  try:
 
319
  print(f"Error processing blurb: {str(e)}")
320
  return np.zeros(384)
321
 
322
+ def process_category(self, campaign: Dict) -> List[int]:
323
+ """
324
+ Process the project category into a one-hot encoding.
325
+
326
+ Args:
327
+ campaign (Dict): Campaign data
328
+
329
+ Returns:
330
+ List[int]: One-hot encoding of the category
331
+ """
332
  try:
333
  # All categories in the dataset
334
  fixed_categories = [
 
345
  print(f"Error processing category: {str(e)}")
346
  return [0] * 15
347
 
348
+ def process_subcategory_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
349
+ """
350
+ Process the project subcategory to generate a GloVe embedding.
351
+
352
+ Args:
353
+ campaign (Dict): Campaign data
354
+ idx (int): Index of the campaign
355
+
356
+ Returns:
357
+ np.ndarray: GloVe embedding of the subcategory
358
+ """
359
  self._ensure_models_loaded()
360
 
361
  try:
 
365
  print(f"Error processing subcategory: {str(e)}")
366
  return np.zeros(100)
367
 
368
+ def process_country_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
369
+ """
370
+ Process the project country to generate a GloVe embedding.
371
+
372
+ Args:
373
+ campaign (Dict): Campaign data
374
+ idx (int): Index of the campaign
375
+
376
+ Returns:
377
+ np.ndarray: GloVe embedding of the country
378
+ """
379
  self._ensure_models_loaded()
380
 
381
  try:
 
385
  print(f"Error processing country: {str(e)}")
386
  return np.zeros(100)
387
 
388
+ def process_funding_goal(self, campaign: Dict, idx: int) -> float:
389
+ """
390
+ Process campaign funding goal with logarithmic compression.
391
+
392
+ Applies log10(1 + goal), i.e. np.log1p(goal) / np.log(10), to compress extreme values while
393
+ preserving relative differences between funding goals.
394
+
395
+ Args:
396
+ campaign (Dict): Campaign data
397
+ idx (int): Index of the campaign
398
+
399
+ Returns:
400
+ float: The transformed funding goal
401
+ """
402
+ try:
403
+ goal = float(campaign.get('funding_goal', 0))
404
+
405
+ # Log1p transformation, good for general compression while preserving relative differences
406
+ transformed_goal = np.log1p(goal)/np.log(10)
407
+
408
+ return transformed_goal
409
+
410
+ except Exception as e:
411
+ print(f"Error processing funding goal for campaign {idx}: {str(e)}")
412
+ return 0.0
413
 
414
+ def process_previous_funding_goal(self, campaign: Dict, idx: int) -> float:
415
+ """
416
+ Process previous campaign funding goal with logarithmic compression.
417
+
418
+ Applies log10(1 + previous_goal), i.e. np.log1p(previous_goal) / np.log(10), to compress extreme values while
419
+ preserving relative differences between previous funding goals.
420
+
421
+ Args:
422
+ campaign (Dict): Campaign data
423
+ idx (int): Index of the campaign
424
+
425
+ Returns:
426
+ float: The transformed previous funding goal
427
+ """
428
+ try:
429
+ previous_goal = float(campaign.get('previous_funding_goal', 0))
430
+
431
+ # Log1p transformation, good for general compression while preserving relative differences
432
+ transformed_goal = np.log1p(previous_goal)/np.log(10)
433
+
434
+ return transformed_goal
435
+
436
+ except Exception as e:
437
+ print(f"Error processing previous funding goal for campaign {idx}: {str(e)}")
438
+ return 0.0
439
 
440
+ def process_previous_pledged(self, campaign: Dict, idx: int) -> float:
441
+ """
442
+ Process previous campaign pledged amount with logarithmic compression.
443
+
444
+ Applies log10(1 + pledged), i.e. np.log1p(pledged) / np.log(10), to compress extreme values while
445
+ preserving relative differences between previous pledged amounts.
446
+
447
+ Args:
448
+ campaign (Dict): Campaign data
449
+ idx (int): Index of the campaign
450
+
451
+ Returns:
452
+ float: The transformed previous pledged amount
453
+ """
454
+ try:
455
+ pledged = float(campaign.get('previous_pledged', 0))
456
+
457
+ # Log1p transformation, good for general compression while preserving relative differences
458
+ transformed_pledge = np.log1p(pledged)/np.log(10)
459
+
460
+ return transformed_pledge
461
+
462
+ except Exception as e:
463
+ print(f"Error processing pledge amount for campaign {idx}: {str(e)}")
464
+ return 0.0
465
 
466
+ def calculate_previous_sucess_rate(self, campaign: Dict, idx: int) -> float:
467
+ """
468
+ Calculate success rate of creator's previous campaigns.
469
+
470
+ Computes the ratio of successful previous projects to total previous projects (0.0 when the creator has no previous projects).
471
+
472
+ Args:
473
+ campaign (Dict): Campaign data
474
+ idx (int): Index of the campaign
475
+
476
+ Returns:
477
+ float: The previous success rate (0-1)
478
+ """
479
+ try:
480
+ previousProjects = float(campaign.get('previous_projects_count', 0))
481
+ previousSuccessfulProjects = float(campaign.get('previous_successful_projects', 0))
482
+
483
+ if previousProjects == 0.0:
484
+ return 0.0
485
+ else:
486
+ previous_success_rate = previousSuccessfulProjects / previousProjects
487
+ return previous_success_rate
488
+
489
+ except Exception as e:
490
+ print(f"Error calculating previous success rate for campaign {idx}: {str(e)}")
491
+ return 0.0
492
 
493
+ def process_campaign(self, campaign: Dict, idx: int) -> Dict:
494
+ """
495
+ Process a single campaign to prepare all required features for prediction.
496
+
497
+ This is the main method that processes a raw campaign and prepares
498
+ all features (embeddings and numerical) for the prediction model.
499
+
500
+ Processing steps include:
501
+ - Text embedding generation using appropriate models
502
+ - Subcategory and country embedding through GloVe
503
+ - Logarithmic transformation of monetary values
504
+ - Normalization of numerical features
505
+
506
+ Args:
507
+ campaign (Dict): Raw campaign data
508
+ idx (int): Index of the campaign
509
+
510
+ Returns:
511
+ Dict: Processed data with all features ready for prediction
512
+ """
513
  self._ensure_models_loaded()
514
 
515
  # Generate embeddings for text fields
 
529
  'country_embedding': self.process_country_embedding(campaign, idx).tolist()
530
  }
531
 
532
+ # Process financial features with logarithmic transformation
533
+ result['funding_goal'] = self.process_funding_goal(campaign, idx)
534
+ result['previous_funding_goal'] = self.process_previous_funding_goal(campaign, idx)
535
+ result['previous_pledged'] = self.process_previous_pledged(campaign, idx)
536
+
537
+ # Calculate success rate based on previous projects
538
+ result['previous_success_rate'] = self.calculate_previous_sucess_rate(campaign, idx)
 
 
 
 
 
 
 
539
 
540
+ # Extract simple integer features
541
  for field in ['image_count', 'video_count', 'campaign_duration', 'previous_projects_count']:
542
  result[field] = int(campaign.get(field, 0))
543