# pinecone_handler.py — Pinecone vector-database handler for the jobsai project.
from datetime import datetime
import logging
import os
import sys
from typing import Any, Dict, List, Optional

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

# Load .env before reading any configuration from the environment.
load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY is not set. Please check your environment or secrets configuration.")

# Imported after load_dotenv() so that settings can rely on .env values.
from settings import (
    LOG_LEVEL,
    LOG_DATE_FORMAT,
    LOG_FORMAT,
    PINECONE_ENVIRONMENT,
    PINECONE_INDEX_NAME
)

log = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=LOG_LEVEL, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)
class PineconeHandler:
    """
    Handles connections and operations with Pinecone vector database
    for storing and retrieving job ads.
    """

    def __init__(self):
        """Connect to the configured Pinecone index (creating it on demand)
        and load the sentence-transformer model used to embed job ads."""
        self.pc = Pinecone(api_key=PINECONE_API_KEY)
        self.BATCH_SIZE = 100  # Number of vectors to upsert at once
        try:
            self.index = self.pc.Index(PINECONE_INDEX_NAME)
            log.info(f"Connected to existing index '{PINECONE_INDEX_NAME}'")
        except Exception:
            # NOTE(review): newer Pinecone clients return an Index handle
            # lazily without a network call, so this branch may never fire
            # for a missing index — confirm against the client version in use.
            log.info(f"Creating new index '{PINECONE_INDEX_NAME}'")
            spec = ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
            self.pc.create_index(
                name=PINECONE_INDEX_NAME,
                dimension=384,  # must match the embedding model's output size
                metric="cosine",
                spec=spec
            )
            self.index = self.pc.Index(PINECONE_INDEX_NAME)
        self.model = SentenceTransformer('forestav/job_matching_sentence_transformer')
        log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")

    def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:
        """Create an embedding vector from the ad's headline, occupation label
        and description text.

        Raises:
            ValueError: if none of the text fields yields any content.
        """
        try:
            # Safely get text fields with fallbacks to empty string
            headline = ad.get('headline', '') or ''
            occupation = ad.get('occupation', {})
            occupation_label = occupation.get('label', '') if occupation else ''
            description = ad.get('description', {})
            description_text = description.get('text', '') if description else ''
            # Combine text fields
            text_to_embed = f"{headline} {occupation_label} {description_text}".strip()
            # If we have no text to embed, raise an exception
            if not text_to_embed:
                raise ValueError("No text content available for embedding")
            return self.model.encode(text_to_embed).tolist()
        except Exception as e:
            log.error(f"Error creating embedding for ad {ad.get('id', 'unknown')}: {str(e)}")
            raise

    def _prepare_metadata(self, ad: Dict[str, Any]) -> Dict[str, str]:
        """Extract and size-limit the metadata fields stored alongside the
        vector (Pinecone metadata has a per-record size budget)."""
        try:
            # Safely get nested values with fallbacks
            application_details = ad.get('application_details', {}) or {}
            workplace_address = ad.get('workplace_address', {}) or {}
            occupation = ad.get('occupation', {}) or {}
            description = ad.get('description', {}) or {}
            # Limit the size of text fields and handle potential None values
            return {
                'email': (application_details.get('email', '') or '')[:100],
                'city': (workplace_address.get('municipality', '') or '')[:100],
                'occupation': (occupation.get('label', '') or '')[:100],
                'headline': (ad.get('headline', '') or '')[:200],
                'description': (description.get('text', '') or '')[:2000],
                'logo_url': (ad.get('logo_url', '') or '')[:200],
                'webpage_url': (ad.get('webpage_url', '') or '')[:200],
                'published': (ad.get('publication_date', '') or '')[:50]
            }
        except Exception as e:
            log.error(f"Error preparing metadata for ad {ad.get('id', 'unknown')}: {str(e)}")
            raise

    def _batch_upsert(self, vectors: List[tuple]) -> None:
        """
        Upsert a batch of vectors to Pinecone.

        Args:
            vectors: List of tuples, each containing (id, vector, metadata)
        """
        try:
            # Pinecone expects string IDs; coerce defensively.
            upsert_data = [(str(id), vec, meta) for id, vec, meta in vectors]
            self.index.upsert(vectors=upsert_data)
            log.debug(f"Successfully upserted batch of {len(vectors)} vectors")
        except Exception as e:
            log.error(f"Error upserting batch: {str(e)}")
            raise

    def upsert_ads(self, ads: List[Dict[str, Any]]) -> None:
        """Insert or update multiple ads in batches.

        Ads flagged 'removed' are deleted from the index; ads without an ID
        or that fail embedding/metadata preparation are skipped and logged.
        """
        vectors = []
        deleted = 0
        processed = 0
        skipped = 0
        for ad in ads:
            try:
                # Skip None or empty ads
                if not ad:
                    log.warning("Skipping None or empty ad")
                    skipped += 1
                    continue
                ad_id = ad.get('id')
                if not ad_id:
                    log.warning("Skipping ad without ID")
                    skipped += 1
                    continue
                if ad.get('removed', False):
                    self.delete_ad(ad_id)
                    deleted += 1
                    continue
                try:
                    vector = self._create_embedding(ad)
                    metadata = self._prepare_metadata(ad)
                    vectors.append((ad_id, vector, metadata))
                    processed += 1
                    # When we reach batch size, upsert and start a new batch
                    if len(vectors) >= self.BATCH_SIZE:
                        self._batch_upsert(vectors)
                        vectors = []
                except Exception as e:
                    # One bad ad must not abort the whole run.
                    log.error(f"Error processing ad {ad_id}: {str(e)}")
                    skipped += 1
            except Exception as e:
                log.error(f"Unexpected error processing ad: {str(e)}")
                skipped += 1
        # Upsert any remaining vectors
        if vectors:
            self._batch_upsert(vectors)
        log.info(f"Processing complete: {processed} ads upserted, {deleted} deleted, {skipped} skipped")

    def delete_ad(self, ad_id: str) -> None:
        """Delete an ad by ID. Failures are logged, not raised (best effort)."""
        try:
            self.index.delete(ids=[ad_id])
            log.debug(f"Deleted ad {ad_id} from Pinecone")
        except Exception as e:
            log.error(f"Error deleting ad {ad_id}: {str(e)}")

    def search_similar_ads(self, query: str, top_k: int = 5, city: Optional[str] = None) -> List[Dict[str, Any]]:
        """Search for similar job ads based on text query with optional city filtering.

        Args:
            query: Free-text query to embed and match against stored ads.
            top_k: Maximum number of matches to return.
            city: Optional municipality name; matched exactly after
                normalization to capitalized form (e.g. "stockholm" -> "Stockholm").

        Returns:
            The list of Pinecone match objects (with metadata included).
        """
        query_embedding = self.model.encode(query).tolist()
        metadata_filter = {}
        if city:
            # BUGFIX: normalize before the emptiness check — the original
            # stripped after the check, so a whitespace-only city reached
            # city[0] and raised IndexError. str.capitalize() is equivalent
            # to lowercasing and uppercasing the first character.
            normalized_city = city.strip().capitalize()
            if normalized_city:
                metadata_filter["city"] = {"$eq": normalized_city}
        results = self.index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
            filter=metadata_filter if metadata_filter else None
        )
        return results.matches
def load_all(all_ads):
    """Embed and upsert the full collection of job ads into Pinecone."""
    PineconeHandler().upsert_ads(all_ads)
def update(list_of_updated_ads):
    """Upsert a batch of changed ads and log the elapsed processing time."""
    started_at = datetime.now()
    PineconeHandler().upsert_ads(list_of_updated_ads)
    log.info(f"{len(list_of_updated_ads)} ads processed. Time: {datetime.now() - started_at}")