Spaces:

angusfung
/

Kickstarter-prediction-embedding

Sleeping

App Files Files Community

angusfung committed on Apr 22

Commit

7e4ff82

verified ·

1 Parent(s): a84b543

updated numerical feature engineering

Browse files

Files changed (1) hide show

src/ProcessOneSingleCampaign.py +289 -41

src/ProcessOneSingleCampaign.py CHANGED Viewed

@@ -1,3 +1,19 @@
 import os
 # Set gensim data directory to a writable location at the very start
 os.environ['GENSIM_DATA_DIR'] = '/tmp/gensim-data'
@@ -9,29 +25,57 @@ except Exception as e:
 import json
 import numpy as np
-from typing import Dict
 import torch
 from transformers import AutoTokenizer, AutoModel
 import gc
 import gensim.downloader
 class CampaignProcessor:
-    def __init__(self, data, lazy_load=False):
         self.data = data
         self.categories = sorted(list(set(camp.get('raw_category', '') for camp in self.data)))
         self.lazy_load = lazy_load
-        self.tokenizer = None
-        self.model = None
-        self.RiskandBlurb_tokenizer = None
-        self.RiskandBlurb_model = None
-        self.glove = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         if not lazy_load:
             self._load_models()
     def _load_models(self):
         print("Loading NLP models...")
         # Cache models locally to avoid downloading every time
         cache_dir = "/tmp/model_cache"
@@ -120,33 +164,60 @@ class CampaignProcessor:
             raise e
     def _ensure_models_loaded(self):
         if self.model is None or self.tokenizer is None or self.RiskandBlurb_model is None or self.RiskandBlurb_tokenizer is None or self.glove is None:
             self._load_models()
-    def _process_text_embedding(self, text, max_length, tokenizer, model):
-        # Common function for text embedding generation
         if self.device.type == 'cuda':
             torch.cuda.empty_cache()
         gc.collect()
         inputs = tokenizer(text,
                         padding=True,
                         truncation=True,
                         max_length=max_length,
                         return_tensors="pt")
         inputs = {k: v.to(self.device) for k, v in inputs.items()}
         with torch.no_grad():
             outputs = model(**inputs)
         attention_mask = inputs['attention_mask']
         token_embeddings = outputs.last_hidden_state
         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
         sentence_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
         embedding = sentence_embeddings.cpu().numpy()
         del inputs, outputs, token_embeddings, sentence_embeddings
         if self.device.type == 'cuda':
             torch.cuda.empty_cache()
@@ -154,8 +225,17 @@ class CampaignProcessor:
         return embedding[0]
-    def _get_glove_embedding(self, text, dim=100):
-        # Common function for GloVe embeddings (subcategory and country)
         if not text:
             return np.zeros(dim)
@@ -164,16 +244,30 @@ class CampaignProcessor:
         words = text.split()
         vectors = []
         for word in words:
             if word in self.glove:
                 vectors.append(self.glove[word])
         if vectors:
             return np.mean(vectors, axis=0)
         else:
             return np.zeros(dim)
-    def process_description_embedding(self, campaign: Dict, idx: int):
         self._ensure_models_loaded()
         try:
@@ -185,7 +279,17 @@ class CampaignProcessor:
             print(f"Error processing description: {str(e)}")
             return np.zeros(768), 0
-    def process_riskandchallenges_embedding(self, campaign: Dict, idx: int):
         self._ensure_models_loaded()
         try:
@@ -195,7 +299,17 @@ class CampaignProcessor:
             print(f"Error processing risk statement: {str(e)}")
             return np.zeros(384)
-    def process_blurb(self, campaign: Dict, idx: int):
         self._ensure_models_loaded()
         try:
@@ -205,7 +319,16 @@ class CampaignProcessor:
             print(f"Error processing blurb: {str(e)}")
             return np.zeros(384)
-    def process_category(self, campaign: Dict):
         try:
             # All categories in the dataset
             fixed_categories = [
@@ -222,7 +345,17 @@ class CampaignProcessor:
             print(f"Error processing category: {str(e)}")
             return [0] * 15
-    def process_subcategory_embedding(self, campaign: Dict, idx: int):
         self._ensure_models_loaded()
         try:
@@ -232,7 +365,17 @@ class CampaignProcessor:
             print(f"Error processing subcategory: {str(e)}")
             return np.zeros(100)
-    def process_country_embedding(self, campaign: Dict, idx: int):
         self._ensure_models_loaded()
         try:
@@ -242,19 +385,131 @@ class CampaignProcessor:
             print(f"Error processing country: {str(e)}")
             return np.zeros(100)
-    def process_funding_goal(self, campaign: Dict, idx: int):
-        return float(campaign.get('funding_goal', 0))
-    def process_previous_funding_goal(self, campaign: Dict, idx: int):
-        return float(campaign.get('previous_funding_goal', 0))
-    def process_previous_pledged(self, campaign: Dict, idx: int):
-        return float(campaign.get('previous_pledged', 0))
-    def calculate_previous_sucess_rate(self, campaign: Dict, idx: int):
-        return float(campaign.get('previous_success_rate', 0))
-    def process_campaign(self, campaign: Dict, idx: int):
         self._ensure_models_loaded()
         # Generate embeddings for text fields
@@ -274,22 +529,15 @@ class CampaignProcessor:
             'country_embedding': self.process_country_embedding(campaign, idx).tolist()
         }
-        # Process numerical features or use existing values from input
-        numerical_fields = [
-            ('funding_goal', self.process_funding_goal),
-            ('previous_funding_goal', self.process_previous_funding_goal),
-            ('previous_pledged', self.process_previous_pledged),
-            ('previous_success_rate', self.calculate_previous_sucess_rate)
-        ]
-        # Process numerical features or use values from input
-        for field_name, processor_func in numerical_fields:
-            if field_name in campaign:
-                result[field_name] = campaign[field_name]
-            else:
-                result[field_name] = processor_func(campaign, idx)
-        # Simple integer fields
         for field in ['image_count', 'video_count', 'campaign_duration', 'previous_projects_count']:
             result[field] = int(campaign.get(field, 0))

+"""
+Campaign Data Processor for Kickstarter Prediction
+This module handles the preprocessing of raw Kickstarter campaign data,
+generating text embeddings and preparing numerical features for prediction.
+Key functionality:
+- Longformer embeddings for project descriptions
+- Sentence transformer embeddings for blurbs and risk statements
+- GloVe embeddings for subcategories and countries
+- Normalization of numerical features
+Author: Angus Fung
+Date: April 2025
+"""
 import os
 # Set gensim data directory to a writable location at the very start
 os.environ['GENSIM_DATA_DIR'] = '/tmp/gensim-data'
 import json
 import numpy as np
+from typing import Dict, List, Tuple, Any, Optional
 import torch
 from transformers import AutoTokenizer, AutoModel
 import gc
 import gensim.downloader
 class CampaignProcessor:
+    """
+    Processor for Kickstarter campaign data.
+    This class handles the preprocessing of raw campaign data, transforming
+    text and categorical features into embeddings using various NLP models
+    and preparing numerical features for the prediction model.
+    """
+    def __init__(self, data: List[Dict], lazy_load: bool = False):
+        """
+        Initialize the CampaignProcessor.
+        Args:
+            data (List[Dict]): List of campaign dictionaries to process
+            lazy_load (bool): If True, models will be loaded only when needed
+                             rather than at initialization time
+        """
         self.data = data
         self.categories = sorted(list(set(camp.get('raw_category', '') for camp in self.data)))
         self.lazy_load = lazy_load
+        # Initialize model variables (to be loaded later)
+        self.tokenizer = None  # Longformer tokenizer for descriptions
+        self.model = None  # Longformer model for descriptions
+        self.RiskandBlurb_tokenizer = None  # MiniLM tokenizer for blurb and risk
+        self.RiskandBlurb_model = None  # MiniLM model for blurb and risk
+        self.glove = None  # GloVe word vectors for categories and countries
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        # Load models at initialization if not using lazy loading
         if not lazy_load:
             self._load_models()
     def _load_models(self):
+        """
+        Load all NLP models required for processing campaign data.
+        This method loads:
+        - Longformer for description embeddings
+        - MiniLM for blurb and risk embeddings
+        - GloVe for subcategory and country embeddings
+        Models are cached to avoid reloading and moved to the appropriate device.
+        """
         print("Loading NLP models...")
         # Cache models locally to avoid downloading every time
         cache_dir = "/tmp/model_cache"
             raise e
     def _ensure_models_loaded(self):
+        """
+        Ensure all required models are loaded.
+        This is called before any processing to make sure models are ready,
+        particularly important when using lazy loading.
+        """
         if self.model is None or self.tokenizer is None or self.RiskandBlurb_model is None or self.RiskandBlurb_tokenizer is None or self.glove is None:
             self._load_models()
+    def _process_text_embedding(self, text: str, max_length: int, tokenizer: AutoTokenizer, model: AutoModel) -> np.ndarray:
+        """
+        Generate embedding for text using the specified model and tokenizer.
+        This method handles tokenization, model inference, and pooling to
+        create a single vector representation of the input text.
+        Args:
+            text (str): Text to embed
+            max_length (int): Maximum token length for the model
+            tokenizer (AutoTokenizer): Tokenizer to use
+            model (AutoModel): Model to use for embedding generation
+        Returns:
+            np.ndarray: Embedding vector for the text
+        """
+        # Clean up memory before processing
         if self.device.type == 'cuda':
             torch.cuda.empty_cache()
         gc.collect()
+        # Tokenize the text
         inputs = tokenizer(text,
                         padding=True,
                         truncation=True,
                         max_length=max_length,
                         return_tensors="pt")
+        # Move inputs to device
         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+        # Generate embeddings
         with torch.no_grad():
             outputs = model(**inputs)
+        # Mean pooling - take average of all token embeddings
         attention_mask = inputs['attention_mask']
         token_embeddings = outputs.last_hidden_state
         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
         sentence_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+        # Convert to numpy array
         embedding = sentence_embeddings.cpu().numpy()
+        # Clean up to prevent memory leaks
         del inputs, outputs, token_embeddings, sentence_embeddings
         if self.device.type == 'cuda':
             torch.cuda.empty_cache()
         return embedding[0]
+    def _get_glove_embedding(self, text: str, dim: int = 100) -> np.ndarray:
+        """
+        Generate GloVe embedding for a text by averaging word vectors.
+        Args:
+            text (str): Text to embed
+            dim (int): Dimension of the GloVe embeddings
+        Returns:
+            np.ndarray: Averaged GloVe embedding for the text
+        """
         if not text:
             return np.zeros(dim)
         words = text.split()
         vectors = []
+        # Collect vectors for words that exist in the vocabulary
         for word in words:
             if word in self.glove:
                 vectors.append(self.glove[word])
+        # Average vectors if any exist, otherwise return zeros
         if vectors:
             return np.mean(vectors, axis=0)
         else:
             return np.zeros(dim)
+    def process_description_embedding(self, campaign: Dict, idx: int) -> Tuple[np.ndarray, int]:
+        """
+        Process the project description to generate a Longformer embedding.
+        Args:
+            campaign (Dict): Campaign data
+            idx (int): Index of the campaign
+        Returns:
+            Tuple containing:
+                - np.ndarray: Longformer embedding of the description
+                - int: Word count of the description
+        """
         self._ensure_models_loaded()
         try:
             print(f"Error processing description: {str(e)}")
             return np.zeros(768), 0
+    def process_riskandchallenges_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
+        """
+        Process the risks and challenges section to generate a MiniLM embedding.
+        Args:
+            campaign (Dict): Campaign data
+            idx (int): Index of the campaign
+        Returns:
+            np.ndarray: MiniLM embedding of the risks section
+        """
         self._ensure_models_loaded()
         try:
             print(f"Error processing risk statement: {str(e)}")
             return np.zeros(384)
+    def process_blurb(self, campaign: Dict, idx: int) -> np.ndarray:
+        """
+        Process the project blurb to generate a MiniLM embedding.
+        Args:
+            campaign (Dict): Campaign data
+            idx (int): Index of the campaign
+        Returns:
+            np.ndarray: MiniLM embedding of the blurb
+        """
         self._ensure_models_loaded()
         try:
             print(f"Error processing blurb: {str(e)}")
             return np.zeros(384)
+    def process_category(self, campaign: Dict) -> List[int]:
+        """
+        Process the project category into a one-hot encoding.
+        Args:
+            campaign (Dict): Campaign data
+        Returns:
+            List[int]: One-hot encoding of the category
+        """
         try:
             # All categories in the dataset
             fixed_categories = [
             print(f"Error processing category: {str(e)}")
             return [0] * 15
+    def process_subcategory_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
+        """
+        Process the project subcategory to generate a GloVe embedding.
+        Args:
+            campaign (Dict): Campaign data
+            idx (int): Index of the campaign
+        Returns:
+            np.ndarray: GloVe embedding of the subcategory
+        """
         self._ensure_models_loaded()
         try:
             print(f"Error processing subcategory: {str(e)}")
             return np.zeros(100)
+    def process_country_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
+        """
+        Process the project country to generate a GloVe embedding.
+        Args:
+            campaign (Dict): Campaign data
+            idx (int): Index of the campaign
+        Returns:
+            np.ndarray: GloVe embedding of the country
+        """
         self._ensure_models_loaded()
         try:
             print(f"Error processing country: {str(e)}")
             return np.zeros(100)
+    def process_funding_goal(self, campaign: Dict, idx: int) -> float:
+        """
+        Process campaign funding goal with logarithmic compression.
+        Applies Log1p transformation with base 10 to compress extreme values while
+        preserving relative differences between funding goals.
+        Args:
+            campaign (Dict): Campaign data
+            idx (int): Index of the campaign
+        Returns:
+            float: The transformed funding goal
+        """
+        try:
+            goal = float(campaign.get('funding_goal', 0))
+            # Log1p transformation, good for general compression while preserving relative differences
+            transformed_goal = np.log1p(goal)/np.log(10)
+            return transformed_goal
+        except Exception as e:
+            print(f"Error processing funding goal for campaign {idx}: {str(e)}")
+            return 0.0
+    def process_previous_funding_goal(self, campaign: Dict, idx: int) -> float:
+        """
+        Process previous campaign funding goal with logarithmic compression.
+        Applies Log1p transformation with base 10 to compress extreme values while
+        preserving relative differences between previous funding goals.
+        Args:
+            campaign (Dict): Campaign data
+            idx (int): Index of the campaign
+        Returns:
+            float: The transformed previous funding goal
+        """
+        try:
+            previous_goal = float(campaign.get('previous_funding_goal', 0))
+            # Log1p transformation, good for general compression while preserving relative differences
+            transformed_goal = np.log1p(previous_goal)/np.log(10)
+            return transformed_goal
+        except Exception as e:
+            print(f"Error processing previous funding goal for campaign {idx}: {str(e)}")
+            return 0.0
+    def process_previous_pledged(self, campaign: Dict, idx: int) -> float:
+        """
+        Process previous campaign pledged amount with logarithmic compression.
+        Applies Log1p transformation with base 10 to compress extreme values while
+        preserving relative differences between previous pledged amounts.
+        Args:
+            campaign (Dict): Campaign data
+            idx (int): Index of the campaign
+        Returns:
+            float: The transformed previous pledged amount
+        """
+        try:
+            pledged = float(campaign.get('previous_pledged', 0))
+            # Log1p transformation, good for general compression while preserving relative differences
+            transformed_pledge = np.log1p(pledged)/np.log(10)
+            return transformed_pledge
+        except Exception as e:
+            print(f"Error processing pledge amount for campaign {idx}: {str(e)}")
+            return 0.0
+    def calculate_previous_sucess_rate(self, campaign: Dict, idx: int) -> float:
+        """
+        Calculate success rate of creator's previous campaigns.
+        Computes the ratio of successful previous projects to total previous projects.
+        Args:
+            campaign (Dict): Campaign data
+            idx (int): Index of the campaign
+        Returns:
+            float: The previous success rate (0-1)
+        """
+        try:
+            previousProjects = float(campaign.get('previous_projects_count', 0))
+            previousSuccessfulProjects = float(campaign.get('previous_successful_projects', 0))
+            if previousProjects == 0.0:
+                return 0.0
+            else:
+                previous_success_rate = previousSuccessfulProjects / previousProjects
+                return previous_success_rate
+        except Exception as e:
+            print(f"Error calculating previous success rate for campaign {idx}: {str(e)}")
+            return 0.0
+    def process_campaign(self, campaign: Dict, idx: int) -> Dict:
+        """
+        Process a single campaign to prepare all required features for prediction.
+        This is the main method that processes a raw campaign and prepares
+        all features (embeddings and numerical) for the prediction model.
+        Processing steps include:
+        - Text embedding generation using appropriate models
+        - Category and country embedding through GloVe
+        - Logarithmic transformation of monetary values
+        - Normalization of numerical features
+        Args:
+            campaign (Dict): Raw campaign data
+            idx (int): Index of the campaign
+        Returns:
+            Dict: Processed data with all features ready for prediction
+        """
         self._ensure_models_loaded()
         # Generate embeddings for text fields
             'country_embedding': self.process_country_embedding(campaign, idx).tolist()
         }
+        # Process financial features with logarithmic transformation
+        result['funding_goal'] = self.process_funding_goal(campaign, idx)
+        result['previous_funding_goal'] = self.process_previous_funding_goal(campaign, idx)
+        result['previous_pledged'] = self.process_previous_pledged(campaign, idx)
+        # Calculate success rate based on previous projects
+        result['previous_success_rate'] = self.calculate_previous_sucess_rate(campaign, idx)
+        # Extract simple integer features
         for field in ['image_count', 'video_count', 'campaign_duration', 'previous_projects_count']:
             result[field] = int(campaign.get(field, 0))