from datetime import datetime
import sys
import logging
import os
from typing import List, Dict, Any, Optional

from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY is not set. Please check your environment or secrets configuration.")

from settings import (
    LOG_LEVEL,
    LOG_DATE_FORMAT,
    LOG_FORMAT,
    PINECONE_ENVIRONMENT,
    PINECONE_INDEX_NAME
)

log = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=LOG_LEVEL, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)


class PineconeHandler:
    """
    Handles connections and operations with the Pinecone vector database
    for storing and retrieving job ads.
    """

    def __init__(self):
        self.pc = Pinecone(api_key=PINECONE_API_KEY)
        self.BATCH_SIZE = 100  # Number of vectors to upsert at once

        try:
            self.index = self.pc.Index(PINECONE_INDEX_NAME)
            log.info(f"Connected to existing index '{PINECONE_INDEX_NAME}'")
        except Exception:
            log.info(f"Creating new index '{PINECONE_INDEX_NAME}'")
            spec = ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
            # dimension=384 must match the output size of the MiniLM-based
            # sentence-transformer models used below
            self.pc.create_index(
                name=PINECONE_INDEX_NAME,
                dimension=384,
                metric="cosine",
                spec=spec
            )
            self.index = self.pc.Index(PINECONE_INDEX_NAME)

        #self.model = SentenceTransformer('all-MiniLM-L6-v2')
        #self.model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
        self.model = SentenceTransformer('forestav/job_matching_sentence_transformer')
        log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")

    def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:
        """Create an embedding from the job ad text."""
        try:
            # Safely get text fields with fallbacks to empty string
            headline = ad.get('headline', '') or ''
            occupation = ad.get('occupation', {})
            occupation_label = occupation.get('label', '') if occupation else ''
            description = ad.get('description', {})
            description_text = description.get('text', '') if description else ''

            # Combine text fields
            text_to_embed = f"{headline} {occupation_label} {description_text}".strip()

            # If we have no text to embed, raise an exception
            if not text_to_embed:
                raise ValueError("No text content available for embedding")

            return self.model.encode(text_to_embed).tolist()
        except Exception as e:
            log.error(f"Error creating embedding for ad {ad.get('id', 'unknown')}: {str(e)}")
            raise

    def _prepare_metadata(self, ad: Dict[str, Any]) -> Dict[str, str]:
        """Extract metadata from an ad for storage."""
        try:
            # Safely get nested values with fallbacks
            application_details = ad.get('application_details', {}) or {}
            workplace_address = ad.get('workplace_address', {}) or {}
            occupation = ad.get('occupation', {}) or {}
            description = ad.get('description', {}) or {}

            # Limit the size of text fields and handle potential None values
            return {
                'email': (application_details.get('email', '') or '')[:100],
                'city': (workplace_address.get('municipality', '') or '')[:100],
                'occupation': (occupation.get('label', '') or '')[:100],
                'headline': (ad.get('headline', '') or '')[:200],
                'description': (description.get('text', '') or '')[:2000],
                'logo_url': (ad.get('logo_url', '') or '')[:200],
                'webpage_url': (ad.get('webpage_url', '') or '')[:200],
                'published': (ad.get('publication_date', '') or '')[:50]
            }
        except Exception as e:
            log.error(f"Error preparing metadata for ad {ad.get('id', 'unknown')}: {str(e)}")
            raise

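    # A sketch of the ad structure these helpers expect, inferred from the
    # field accesses in _create_embedding and _prepare_metadata above; all
    # values are hypothetical placeholders:
    #
    # {
    #     'id': '29000271',
    #     'headline': 'Senior Python Developer',
    #     'occupation': {'label': 'Backend Developer'},
    #     'description': {'text': 'We are looking for ...'},
    #     'workplace_address': {'municipality': 'Stockholm'},
    #     'application_details': {'email': 'jobs@example.com'},
    #     'webpage_url': 'https://example.com/job/29000271',
    #     'logo_url': '',
    #     'publication_date': '2024-01-15T08:00:00',
    #     'removed': False
    # }
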
    def _batch_upsert(self, vectors: List[tuple]) -> None:
        """
        Upsert a batch of vectors to Pinecone.

        Args:
            vectors: List of tuples, each containing (id, vector, metadata)
        """
        try:
            # Prepare the vectors in the format Pinecone expects
            upsert_data = [(str(vec_id), vec, meta) for vec_id, vec, meta in vectors]

            # Perform the upsert operation
            self.index.upsert(vectors=upsert_data)
            log.debug(f"Successfully upserted batch of {len(vectors)} vectors")
        except Exception as e:
            log.error(f"Error upserting batch: {str(e)}")
            raise

    def upsert_ads(self, ads: List[Dict[str, Any]]) -> None:
        """Insert or update multiple ads in batches."""
        vectors = []
        deleted = 0
        processed = 0
        skipped = 0

        for ad in ads:
            try:
                # Skip None or empty ads
                if not ad:
                    log.warning("Skipping None or empty ad")
                    skipped += 1
                    continue

                ad_id = ad.get('id')
                if not ad_id:
                    log.warning("Skipping ad without ID")
                    skipped += 1
                    continue

                if ad.get('removed', False):
                    self.delete_ad(ad_id)
                    deleted += 1
                    continue

                try:
                    vector = self._create_embedding(ad)
                    metadata = self._prepare_metadata(ad)
                    vectors.append((ad_id, vector, metadata))
                    processed += 1

                    # When we reach batch size, upsert the batch
                    if len(vectors) >= self.BATCH_SIZE:
                        self._batch_upsert(vectors)
                        vectors = []  # Clear the batch
                except Exception as e:
                    log.error(f"Error processing ad {ad_id}: {str(e)}")
                    skipped += 1
            except Exception as e:
                log.error(f"Unexpected error processing ad: {str(e)}")
                skipped += 1

        # Upsert any remaining vectors
        if vectors:
            self._batch_upsert(vectors)

        log.info(f"Processing complete: {processed} ads upserted, {deleted} deleted, {skipped} skipped")

    def delete_ad(self, ad_id: str) -> None:
        """Delete an ad by ID."""
        try:
            self.index.delete(ids=[ad_id])
            log.debug(f"Deleted ad {ad_id} from Pinecone")
        except Exception as e:
            log.error(f"Error deleting ad {ad_id}: {str(e)}")

    def search_similar_ads(self, query: str, top_k: int = 5, city: Optional[str] = None) -> List[Dict[str, Any]]:
        """Search for similar job ads based on a text query, with optional city filtering."""
        query_embedding = self.model.encode(query).tolist()

        # Build the metadata filter if a city is provided
        metadata_filter = {}
        if city:
            city = city.lower().strip()  # Normalize
            if city:  # Guard against whitespace-only input
                # Capitalize the first letter to match the stored metadata
                city = city[0].upper() + city[1:]
                metadata_filter["city"] = {"$eq": city}

        # Execute the Pinecone query with optional metadata filtering
        results = self.index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
            filter=metadata_filter if metadata_filter else None
        )
        return results.matches


def load_all(all_ads):
    """Embed and upsert a full set of ads into Pinecone."""
    handler = PineconeHandler()
    handler.upsert_ads(all_ads)


def update(list_of_updated_ads):
    """Upsert a batch of updated ads and log the elapsed time."""
    start = datetime.now()
    handler = PineconeHandler()
    handler.upsert_ads(list_of_updated_ads)
    log.info(f"{len(list_of_updated_ads)} ads processed. Time: {datetime.now() - start}")