from datetime import datetime
import sys
import logging
import os
from typing import List, Dict, Any, Optional

from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY is not set. Please check your environment or secrets configuration.")

from settings import (
    LOG_LEVEL,
    LOG_DATE_FORMAT,
    LOG_FORMAT,
    PINECONE_ENVIRONMENT,
    PINECONE_INDEX_NAME
)

log = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=LOG_LEVEL, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)


class PineconeHandler:
    """
    Handles connections and operations with the Pinecone vector database
    for storing and retrieving job ads.
    """

    def __init__(self):
        self.pc = Pinecone(api_key=PINECONE_API_KEY)
        self.BATCH_SIZE = 100  # Number of vectors to upsert at once

        try:
            self.index = self.pc.Index(PINECONE_INDEX_NAME)
            log.info(f"Connected to existing index '{PINECONE_INDEX_NAME}'")
        except Exception:
            log.info(f"Creating new index '{PINECONE_INDEX_NAME}'")
            spec = ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
            # dimension=384 must match the output size of the MiniLM-based
            # sentence-transformer models used below
            self.pc.create_index(
                name=PINECONE_INDEX_NAME,
                dimension=384,
                metric="cosine",
                spec=spec
            )
            self.index = self.pc.Index(PINECONE_INDEX_NAME)

        #self.model = SentenceTransformer('all-MiniLM-L6-v2')
        #self.model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
        self.model = SentenceTransformer('forestav/job_matching_sentence_transformer')
        log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")

    def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:
        """Create an embedding from the job ad text."""
        try:
            # Safely get text fields with fallbacks to empty string
            headline = ad.get('headline', '') or ''
            occupation = ad.get('occupation', {})
            occupation_label = occupation.get('label', '') if occupation else ''
            description = ad.get('description', {})
            description_text = description.get('text', '') if description else ''

            # Combine text fields
            text_to_embed = f"{headline} {occupation_label} {description_text}".strip()

            # If we have no text to embed, raise an exception
            if not text_to_embed:
                raise ValueError("No text content available for embedding")

            return self.model.encode(text_to_embed).tolist()
        except Exception as e:
            log.error(f"Error creating embedding for ad {ad.get('id', 'unknown')}: {str(e)}")
            raise

    def _prepare_metadata(self, ad: Dict[str, Any]) -> Dict[str, str]:
        """Extract metadata from an ad for storage."""
        try:
            # Safely get nested values with fallbacks
            application_details = ad.get('application_details', {}) or {}
            workplace_address = ad.get('workplace_address', {}) or {}
            occupation = ad.get('occupation', {}) or {}
            description = ad.get('description', {}) or {}

            # Limit the size of text fields and handle potential None values
            return {
                'email': (application_details.get('email', '') or '')[:100],
                'city': (workplace_address.get('municipality', '') or '')[:100],
                'occupation': (occupation.get('label', '') or '')[:100],
                'headline': (ad.get('headline', '') or '')[:200],
                'description': (description.get('text', '') or '')[:2000],
                'logo_url': (ad.get('logo_url', '') or '')[:200],
                'webpage_url': (ad.get('webpage_url', '') or '')[:200],
                'published': (ad.get('publication_date', '') or '')[:50]
            }
        except Exception as e:
            log.error(f"Error preparing metadata for ad {ad.get('id', 'unknown')}: {str(e)}")
            raise

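    # A sketch of the ad structure these helpers expect, inferred from the
    # field accesses in _create_embedding and _prepare_metadata above; all
    # values are hypothetical placeholders:
    #
    # {
    #     'id': '29000271',
    #     'headline': 'Senior Python Developer',
    #     'occupation': {'label': 'Backend Developer'},
    #     'description': {'text': 'We are looking for ...'},
    #     'workplace_address': {'municipality': 'Stockholm'},
    #     'application_details': {'email': 'jobs@example.com'},
    #     'webpage_url': 'https://example.com/job/29000271',
    #     'logo_url': '',
    #     'publication_date': '2024-01-15T08:00:00',
    #     'removed': False
    # }
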
    def _batch_upsert(self, vectors: List[tuple]) -> None:
        """
        Upsert a batch of vectors to Pinecone.

        Args:
            vectors: List of tuples, each containing (id, vector, metadata)
        """
        try:
            # Prepare the vectors in the format Pinecone expects
            upsert_data = [(str(vec_id), vec, meta) for vec_id, vec, meta in vectors]

            # Perform the upsert operation
            self.index.upsert(vectors=upsert_data)
            log.debug(f"Successfully upserted batch of {len(vectors)} vectors")
        except Exception as e:
            log.error(f"Error upserting batch: {str(e)}")
            raise

    def upsert_ads(self, ads: List[Dict[str, Any]]) -> None:
        """Insert or update multiple ads in batches."""
        vectors = []
        deleted = 0
        processed = 0
        skipped = 0

        for ad in ads:
            try:
                # Skip None or empty ads
                if not ad:
                    log.warning("Skipping None or empty ad")
                    skipped += 1
                    continue

                ad_id = ad.get('id')
                if not ad_id:
                    log.warning("Skipping ad without ID")
                    skipped += 1
                    continue

                if ad.get('removed', False):
                    self.delete_ad(ad_id)
                    deleted += 1
                    continue

                try:
                    vector = self._create_embedding(ad)
                    metadata = self._prepare_metadata(ad)
                    vectors.append((ad_id, vector, metadata))
                    processed += 1

                    # When we reach batch size, upsert the batch
                    if len(vectors) >= self.BATCH_SIZE:
                        self._batch_upsert(vectors)
                        vectors = []  # Clear the batch
                except Exception as e:
                    log.error(f"Error processing ad {ad_id}: {str(e)}")
                    skipped += 1
            except Exception as e:
                log.error(f"Unexpected error processing ad: {str(e)}")
                skipped += 1

        # Upsert any remaining vectors
        if vectors:
            self._batch_upsert(vectors)

        log.info(f"Processing complete: {processed} ads upserted, {deleted} deleted, {skipped} skipped")

    def delete_ad(self, ad_id: str) -> None:
        """Delete an ad by ID."""
        try:
            self.index.delete(ids=[ad_id])
            log.debug(f"Deleted ad {ad_id} from Pinecone")
        except Exception as e:
            log.error(f"Error deleting ad {ad_id}: {str(e)}")

    def search_similar_ads(self, query: str, top_k: int = 5, city: Optional[str] = None) -> List[Dict[str, Any]]:
        """Search for similar job ads based on a text query, with optional city filtering."""
        query_embedding = self.model.encode(query).tolist()

        # Build the metadata filter if a city is provided
        metadata_filter = {}
        if city:
            city = city.lower().strip()  # Normalize
            if city:  # Guard against whitespace-only input
                # Capitalize the first letter to match the stored metadata
                city = city[0].upper() + city[1:]
                metadata_filter["city"] = {"$eq": city}

        # Execute the Pinecone query with optional metadata filtering
        results = self.index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
            filter=metadata_filter if metadata_filter else None
        )
        return results.matches


def load_all(all_ads):
    """Embed and upsert a full set of ads into Pinecone."""
    handler = PineconeHandler()
    handler.upsert_ads(all_ads)


def update(list_of_updated_ads):
    """Upsert a batch of updated ads and log the elapsed time."""
    start = datetime.now()
    handler = PineconeHandler()
    handler.upsert_ads(list_of_updated_ads)
    log.info(f"{len(list_of_updated_ads)} ads processed. Time: {datetime.now() - start}")