# pinecone_handler.py — Pinecone vector-database handler for the jobsai project.
from datetime import datetime
import logging
import os
import sys
from typing import Any, Dict, List, Optional

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

# Load .env before reading any configuration from the environment.
load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY is not set. Please check your environment or secrets configuration.")

# Imported after load_dotenv() so that settings can rely on .env values.
from settings import (
    LOG_LEVEL,
    LOG_DATE_FORMAT,
    LOG_FORMAT,
    PINECONE_ENVIRONMENT,
    PINECONE_INDEX_NAME
)

log = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=LOG_LEVEL, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)
class PineconeHandler:
    """
    Handles connections and operations with Pinecone vector database
    for storing and retrieving job ads.
    """

    def __init__(self):
        """Connect to the configured Pinecone index (creating it on demand)
        and load the sentence-transformer model used to embed job ads."""
        self.pc = Pinecone(api_key=PINECONE_API_KEY)
        self.BATCH_SIZE = 100  # Number of vectors to upsert at once
        try:
            self.index = self.pc.Index(PINECONE_INDEX_NAME)
            log.info(f"Connected to existing index '{PINECONE_INDEX_NAME}'")
        except Exception:
            # NOTE(review): newer Pinecone clients return an Index handle
            # lazily without a network call, so this branch may never fire
            # for a missing index — confirm against the client version in use.
            log.info(f"Creating new index '{PINECONE_INDEX_NAME}'")
            spec = ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
            self.pc.create_index(
                name=PINECONE_INDEX_NAME,
                dimension=384,  # must match the embedding model's output size
                metric="cosine",
                spec=spec
            )
            self.index = self.pc.Index(PINECONE_INDEX_NAME)
        self.model = SentenceTransformer('forestav/job_matching_sentence_transformer')
        log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")

    def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:
        """Create an embedding vector from the ad's headline, occupation label
        and description text.

        Raises:
            ValueError: if none of the text fields yields any content.
        """
        try:
            # Safely get text fields with fallbacks to empty string
            headline = ad.get('headline', '') or ''
            occupation = ad.get('occupation', {})
            occupation_label = occupation.get('label', '') if occupation else ''
            description = ad.get('description', {})
            description_text = description.get('text', '') if description else ''
            # Combine text fields
            text_to_embed = f"{headline} {occupation_label} {description_text}".strip()
            # If we have no text to embed, raise an exception
            if not text_to_embed:
                raise ValueError("No text content available for embedding")
            return self.model.encode(text_to_embed).tolist()
        except Exception as e:
            log.error(f"Error creating embedding for ad {ad.get('id', 'unknown')}: {str(e)}")
            raise

    def _prepare_metadata(self, ad: Dict[str, Any]) -> Dict[str, str]:
        """Extract and size-limit the metadata fields stored alongside the
        vector (Pinecone metadata has a per-record size budget)."""
        try:
            # Safely get nested values with fallbacks
            application_details = ad.get('application_details', {}) or {}
            workplace_address = ad.get('workplace_address', {}) or {}
            occupation = ad.get('occupation', {}) or {}
            description = ad.get('description', {}) or {}
            # Limit the size of text fields and handle potential None values
            return {
                'email': (application_details.get('email', '') or '')[:100],
                'city': (workplace_address.get('municipality', '') or '')[:100],
                'occupation': (occupation.get('label', '') or '')[:100],
                'headline': (ad.get('headline', '') or '')[:200],
                'description': (description.get('text', '') or '')[:2000],
                'logo_url': (ad.get('logo_url', '') or '')[:200],
                'webpage_url': (ad.get('webpage_url', '') or '')[:200],
                'published': (ad.get('publication_date', '') or '')[:50]
            }
        except Exception as e:
            log.error(f"Error preparing metadata for ad {ad.get('id', 'unknown')}: {str(e)}")
            raise

    def _batch_upsert(self, vectors: List[tuple]) -> None:
        """
        Upsert a batch of vectors to Pinecone.

        Args:
            vectors: List of tuples, each containing (id, vector, metadata)
        """
        try:
            # Pinecone expects string IDs; coerce defensively.
            upsert_data = [(str(id), vec, meta) for id, vec, meta in vectors]
            self.index.upsert(vectors=upsert_data)
            log.debug(f"Successfully upserted batch of {len(vectors)} vectors")
        except Exception as e:
            log.error(f"Error upserting batch: {str(e)}")
            raise

    def upsert_ads(self, ads: List[Dict[str, Any]]) -> None:
        """Insert or update multiple ads in batches.

        Ads flagged 'removed' are deleted from the index; ads without an ID
        or that fail embedding/metadata preparation are skipped and logged.
        """
        vectors = []
        deleted = 0
        processed = 0
        skipped = 0
        for ad in ads:
            try:
                # Skip None or empty ads
                if not ad:
                    log.warning("Skipping None or empty ad")
                    skipped += 1
                    continue
                ad_id = ad.get('id')
                if not ad_id:
                    log.warning("Skipping ad without ID")
                    skipped += 1
                    continue
                if ad.get('removed', False):
                    self.delete_ad(ad_id)
                    deleted += 1
                    continue
                try:
                    vector = self._create_embedding(ad)
                    metadata = self._prepare_metadata(ad)
                    vectors.append((ad_id, vector, metadata))
                    processed += 1
                    # When we reach batch size, upsert and start a new batch
                    if len(vectors) >= self.BATCH_SIZE:
                        self._batch_upsert(vectors)
                        vectors = []
                except Exception as e:
                    # One bad ad must not abort the whole run.
                    log.error(f"Error processing ad {ad_id}: {str(e)}")
                    skipped += 1
            except Exception as e:
                log.error(f"Unexpected error processing ad: {str(e)}")
                skipped += 1
        # Upsert any remaining vectors
        if vectors:
            self._batch_upsert(vectors)
        log.info(f"Processing complete: {processed} ads upserted, {deleted} deleted, {skipped} skipped")

    def delete_ad(self, ad_id: str) -> None:
        """Delete an ad by ID. Failures are logged, not raised (best effort)."""
        try:
            self.index.delete(ids=[ad_id])
            log.debug(f"Deleted ad {ad_id} from Pinecone")
        except Exception as e:
            log.error(f"Error deleting ad {ad_id}: {str(e)}")

    def search_similar_ads(self, query: str, top_k: int = 5, city: Optional[str] = None) -> List[Dict[str, Any]]:
        """Search for similar job ads based on text query with optional city filtering.

        Args:
            query: Free-text query to embed and match against stored ads.
            top_k: Maximum number of matches to return.
            city: Optional municipality name; matched exactly after
                normalization to capitalized form (e.g. "stockholm" -> "Stockholm").

        Returns:
            The list of Pinecone match objects (with metadata included).
        """
        query_embedding = self.model.encode(query).tolist()
        metadata_filter = {}
        if city:
            # BUGFIX: normalize before the emptiness check — the original
            # stripped after the check, so a whitespace-only city reached
            # city[0] and raised IndexError. str.capitalize() is equivalent
            # to lowercasing and uppercasing the first character.
            normalized_city = city.strip().capitalize()
            if normalized_city:
                metadata_filter["city"] = {"$eq": normalized_city}
        results = self.index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
            filter=metadata_filter if metadata_filter else None
        )
        return results.matches
def load_all(all_ads):
    """Embed and upsert the full collection of job ads into Pinecone."""
    PineconeHandler().upsert_ads(all_ads)
def update(list_of_updated_ads):
    """Upsert a batch of changed ads and log the elapsed processing time."""
    started_at = datetime.now()
    PineconeHandler().upsert_ads(list_of_updated_ads)
    log.info(f"{len(list_of_updated_ads)} ads processed. Time: {datetime.now() - started_at}")