import logging
import os
import sys
from datetime import datetime
from typing import Any, Dict, List, Optional

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

# Populate os.environ from a .env file (if any) before the API key is read.
load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY is not set. Please check your environment or secrets configuration.")

# Kept after load_dotenv(), as in the original ordering.
# NOTE(review): presumably settings reads environment variables at import
# time - confirm before moving this import to the top of the file.
from settings import (  # noqa: E402
    LOG_LEVEL,
    LOG_DATE_FORMAT,
    LOG_FORMAT,
    PINECONE_ENVIRONMENT,
    PINECONE_INDEX_NAME,
)

log = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=LOG_LEVEL, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)


class PineconeHandler:
    """
    Handles connections and operations with Pinecone vector database
    for storing and retrieving job ads
    """

    # Dimension used when the index has to be created.
    # NOTE(review): assumed to equal the output size of MODEL_NAME - confirm.
    EMBEDDING_DIMENSION = 384

    # Sentence-transformer used for both ads and search queries. Earlier
    # candidates were 'all-MiniLM-L6-v2' and
    # 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'.
    MODEL_NAME = 'forestav/job_matching_sentence_transformer'

    def __init__(self):
        """Connect to the configured index (creating it if opening fails) and load the model."""
        self.pc = Pinecone(api_key=PINECONE_API_KEY)
        self.BATCH_SIZE = 100  # Number of vectors to upsert at once

        try:
            self.index = self.pc.Index(PINECONE_INDEX_NAME)
            log.info(f"Connected to existing index '{PINECONE_INDEX_NAME}'")
        except Exception as exc:
            # Any failure to open the index is treated as "index missing".
            # Record the real reason so other failures can be diagnosed.
            log.debug(f"Could not open index '{PINECONE_INDEX_NAME}': {exc}")
            log.info(f"Creating new index '{PINECONE_INDEX_NAME}'")
            spec = ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
            self.pc.create_index(
                name=PINECONE_INDEX_NAME,
                dimension=self.EMBEDDING_DIMENSION,
                metric="cosine",
                spec=spec
            )
            self.index = self.pc.Index(PINECONE_INDEX_NAME)

        self.model = SentenceTransformer(self.MODEL_NAME)
        log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")

    def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:
        """Create embedding from job ad text.

        The text is the ad's headline, occupation label and description text
        joined by single spaces. Missing or ``None`` fields contribute an
        empty string.

        Args:
            ad: Job ad dict; reads 'headline', 'occupation' -> 'label' and
                'description' -> 'text'.

        Returns:
            The embedding as a plain list.

        Raises:
            ValueError: If none of the fields contains any text.
            Exception: Any error from the model is logged and re-raised.
        """
        try:
            # `or` fallbacks cover keys that are present but hold None, which
            # would otherwise be embedded as the literal text "None".
            headline = ad.get('headline') or ''
            occupation = ad.get('occupation') or {}
            description = ad.get('description') or {}
            occupation_label = occupation.get('label') or ''
            description_text = description.get('text') or ''

            # Combine text fields
            text_to_embed = f"{headline} {occupation_label} {description_text}".strip()

            # If we have no text to embed, raise an exception
            if not text_to_embed:
                raise ValueError("No text content available for embedding")

            return self.model.encode(text_to_embed).tolist()
        except Exception as e:
            log.error(f"Error creating embedding for ad {ad.get('id', 'unknown')}: {str(e)}")
            raise

    def _prepare_metadata(self, ad: Dict[str, Any]) -> Dict[str, str]:
        """Extract metadata from ad for storage.

        Each value is truncated to a fixed maximum length; missing or ``None``
        values become empty strings.

        Args:
            ad: Job ad dict.

        Returns:
            Dict with the keys 'email', 'city', 'occupation', 'headline',
            'description', 'logo_url', 'webpage_url' and 'published'.

        Raises:
            Exception: Any error while reading the ad is logged and re-raised.
        """
        def clip(value: Any, limit: int) -> str:
            # Treat None/empty as '' and cap the length of the stored value.
            return (value or '')[:limit]

        try:
            # Safely get nested values with fallbacks
            application_details = ad.get('application_details') or {}
            workplace_address = ad.get('workplace_address') or {}
            occupation = ad.get('occupation') or {}
            description = ad.get('description') or {}

            return {
                'email': clip(application_details.get('email'), 100),
                'city': clip(workplace_address.get('municipality'), 100),
                'occupation': clip(occupation.get('label'), 100),
                'headline': clip(ad.get('headline'), 200),
                'description': clip(description.get('text'), 2000),
                'logo_url': clip(ad.get('logo_url'), 200),
                'webpage_url': clip(ad.get('webpage_url'), 200),
                'published': clip(ad.get('publication_date'), 50),
            }
        except Exception as e:
            log.error(f"Error preparing metadata for ad {ad.get('id', 'unknown')}: {str(e)}")
            raise

    def _batch_upsert(self, vectors: List[tuple]) -> None:
        """
        Upsert a batch of vectors to Pinecone

        Args:
            vectors: List of tuples, each containing (id, vector, metadata)

        Raises:
            Exception: Any error from the upsert call is logged and re-raised.
        """
        if not vectors:
            # Nothing to send; avoid a pointless request.
            return

        try:
            # IDs are sent as strings, whatever type the caller used.
            upsert_data = [(str(ad_id), vec, meta) for ad_id, vec, meta in vectors]

            # Perform the upsert operation
            self.index.upsert(vectors=upsert_data)
            log.debug(f"Successfully upserted batch of {len(vectors)} vectors")
        except Exception as e:
            log.error(f"Error upserting batch: {str(e)}")
            raise

    def upsert_ads(self, ads: List[Dict[str, Any]]) -> None:
        """Insert or update multiple ads in batches.

        Ads flagged 'removed' are deleted instead of upserted. Empty ads, ads
        without an 'id', and ads whose embedding or metadata cannot be built
        are skipped. Vectors are sent in batches of ``self.BATCH_SIZE``.

        A failed full-size batch is logged and dropped, its ads are counted as
        skipped, and processing continues. A failure of the final partial
        batch propagates to the caller.

        Args:
            ads: Job ad dicts to process.

        Raises:
            Exception: Whatever ``_batch_upsert`` raises for the final batch.
        """
        pending: List[tuple] = []
        deleted = 0
        processed = 0
        skipped = 0

        for ad in ads:
            try:
                # Skip None or empty ads
                if not ad:
                    log.warning("Skipping None or empty ad")
                    skipped += 1
                    continue

                ad_id = ad.get('id')
                if not ad_id:
                    log.warning("Skipping ad without ID")
                    skipped += 1
                    continue

                if ad.get('removed', False):
                    # delete_ad swallows errors, so count from its result.
                    if self.delete_ad(ad_id):
                        deleted += 1
                    else:
                        skipped += 1
                    continue

                try:
                    vector = self._create_embedding(ad)
                    metadata = self._prepare_metadata(ad)
                except Exception as e:
                    log.error(f"Error processing ad {ad_id}: {str(e)}")
                    skipped += 1
                    continue

                pending.append((ad_id, vector, metadata))
            except Exception as e:
                log.error(f"Unexpected error processing ad: {str(e)}")
                skipped += 1
                continue

            # When we reach batch size, upsert the batch
            if len(pending) >= self.BATCH_SIZE:
                # Detach the batch first so a failure cannot leave it behind
                # to grow and be re-sent on every following ad.
                batch, pending = pending, []
                try:
                    self._batch_upsert(batch)
                except Exception:
                    # _batch_upsert has already logged the underlying error.
                    log.error(f"Dropping failed batch of {len(batch)} ads")
                    skipped += len(batch)
                else:
                    processed += len(batch)

        # Upsert any remaining vectors (errors propagate to the caller)
        if pending:
            self._batch_upsert(pending)
            processed += len(pending)

        log.info(f"Processing complete: {processed} ads upserted, {deleted} deleted, {skipped} skipped")

    def delete_ad(self, ad_id: str) -> bool:
        """Delete an ad by ID.

        Errors are logged, not raised.

        Args:
            ad_id: ID of the ad; sent as a string, matching how IDs are
                upserted.

        Returns:
            True if the delete call completed, False if it raised.
        """
        try:
            self.index.delete(ids=[str(ad_id)])
        except Exception as e:
            log.error(f"Error deleting ad {ad_id}: {str(e)}")
            return False

        log.debug(f"Deleted ad {ad_id} from Pinecone")
        return True

    def search_similar_ads(self, query: str, top_k: int = 5, city: Optional[str] = None) -> List[Dict[str, Any]]:
        """Search for similar job ads based on text query with optional city filtering.

        Args:
            query: Free-text query; embedded with the same model as the ads.
            top_k: Maximum number of matches to request.
            city: Optional city. It is stripped, lower-cased, and its first
                letter upper-cased before being matched against the 'city'
                metadata field. ``None``, empty or whitespace-only values
                disable the filter.

        Returns:
            The ``matches`` attribute of the Pinecone query result.
        """
        query_embedding = self.model.encode(query).tolist()

        # Strip before testing for emptiness: a whitespace-only city must
        # mean "no filter", not an IndexError on the first-letter lookup.
        metadata_filter = None
        normalized_city = city.strip().lower() if city else ''
        if normalized_city:
            # NOTE(review): only the first letter is capitalised, so a
            # multi-word city keeps later words lower-case - confirm this
            # matches how 'municipality' values are stored.
            normalized_city = normalized_city[0].upper() + normalized_city[1:]
            metadata_filter = {"city": {"$eq": normalized_city}}

        # Execute the Pinecone query with optional metadata filtering
        results = self.index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
            filter=metadata_filter
        )
        return results.matches


def load_all(all_ads):
    """Upsert every ad in ``all_ads`` using a fresh PineconeHandler."""
    handler = PineconeHandler()
    handler.upsert_ads(all_ads)


def update(list_of_updated_ads):
    """Upsert the given ads using a fresh PineconeHandler and log the elapsed time."""
    start = datetime.now()
    handler = PineconeHandler()
    handler.upsert_ads(list_of_updated_ads)
    log.info(f"{len(list_of_updated_ads)} ads processed. Time: {datetime.now() - start}")