# services/data_service.py
from typing import List, Dict, Any, Tuple
from datetime import datetime
from io import StringIO
import logging

import aiohttp
import faiss
import numpy as np
import pandas as pd

from config.config import settings

logger = logging.getLogger(__name__)
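
# The settings object imported above must provide CSV_URL, MAX_RETRIES and
# CACHE_DURATION (in seconds). A hypothetical shape, assuming pydantic-style
# settings; the field names match this module's usage but the values are
# illustrative only:
#
#     class Settings(BaseSettings):
#         CSV_URL: str = "https://example.com/products.csv"  # pipe-delimited feed
#         MAX_RETRIES: int = 3
#         CACHE_DURATION: int = 3600  # cache lifetime in seconds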


class DataService:
    def __init__(self, model_service):
        self.embedder = model_service.embedder
        self.cache = {}          # holds the prepared DataFrame and FAISS index
        self.last_update = None  # timestamp of the last successful refresh
        self.faiss_index = None
        self.data_cleaned = None

    async def fetch_csv_data(self) -> pd.DataFrame:
        """Fetch the pipe-delimited CSV feed from settings.CSV_URL with retry logic."""
        async with aiohttp.ClientSession() as session:
            for attempt in range(settings.MAX_RETRIES):
                try:
                    async with session.get(settings.CSV_URL) as response:
                        if response.status == 200:
                            content = await response.text()
                            return pd.read_csv(StringIO(content), sep='|')
                        logger.error(f"Failed to fetch data: HTTP {response.status}")
                except Exception as e:
                    logger.error(f"Attempt {attempt + 1} failed: {e}", exc_info=True)
                    if attempt == settings.MAX_RETRIES - 1:
                        raise
        return pd.DataFrame()  # all attempts returned non-200 responses
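        # Note: the retry loop above retries immediately. If the feed endpoint
        # throttles clients, a backoff between attempts would be a reasonable
        # addition, e.g. (sketch, requires "import asyncio"):
        #     await asyncio.sleep(2 ** attempt)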

    async def prepare_data_and_index(self) -> Tuple[pd.DataFrame, Any]:
        """Prepare data and create FAISS index with caching."""
        try:
            current_time = datetime.now()
            # Serve from cache while it is still fresh. total_seconds() is used
            # rather than .seconds, which would discard whole days from the delta.
            if (self.last_update and
                    (current_time - self.last_update).total_seconds() < settings.CACHE_DURATION and
                    self.cache):
                return self.cache['data'], self.cache['index']
            data = await self.fetch_csv_data()
            if data.empty:
                logger.error("Failed to fetch data")
                return pd.DataFrame(), None
            # Data cleaning and preparation
            columns_to_keep = [
                'ID', 'Name', 'Description', 'Price',
                'ProductCategory', 'Grammage',
                'BasePriceText', 'Rating', 'RatingCount',
                'Ingredients', 'CreationDate', 'Keywords', 'Brand'
            ]
            self.data_cleaned = data[columns_to_keep].copy()
            # Clean description text
            self.data_cleaned['Description'] = self.data_cleaned['Description'].astype(str).str.replace(
                r'[^\w\s.,;:\'/?!€$%&()\[\]{}<>|=+\\-]', ' ', regex=True
            )
            # Combine text fields with weights
            self.data_cleaned['combined_text'] = self.data_cleaned.apply(
                lambda row: (
                    f"{row['Name']} {row['Name']} "  # double weight for the name
                    f"{str(row['Description'])} "
                    f"{str(row['Keywords']) if pd.notnull(row['Keywords']) else ''} "
                    f"{str(row['ProductCategory']) if pd.notnull(row['ProductCategory']) else ''}"
                ).strip(),
                axis=1
            )
            # Create FAISS index over embeddings of the combined text
            embeddings = self.embedder.encode(
                self.data_cleaned['combined_text'].tolist(),
                convert_to_tensor=True,
                show_progress_bar=True
            ).cpu().detach().numpy()
            d = embeddings.shape[1]
            self.faiss_index = faiss.IndexFlatL2(d)
            self.faiss_index.add(embeddings)
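            # Design note: IndexFlatL2 performs exact brute-force L2 search,
            # which is simple and accurate for a small catalog. For a much
            # larger one, an approximate index could be swapped in; a
            # hypothetical sketch:
            #     nlist = 100                          # number of Voronoi cells
            #     quantizer = faiss.IndexFlatL2(d)
            #     index = faiss.IndexIVFFlat(quantizer, d, nlist)
            #     index.train(embeddings)              # IVF requires a training pass
            #     index.add(embeddings)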
            # Update cache
            self.cache = {
                'data': self.data_cleaned,
                'index': self.faiss_index
            }
            self.last_update = current_time
            return self.data_cleaned, self.faiss_index
        except Exception as e:
            logger.error(f"Error in prepare_data_and_index: {e}", exc_info=True)
            return pd.DataFrame(), None

    async def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Search for products similar to the query."""
        try:
            # Explicit None check: FAISS index objects do not define truthiness
            if self.faiss_index is None:
                self.data_cleaned, self.faiss_index = await self.prepare_data_and_index()
                if self.faiss_index is None:
                    return []
            # Create query embedding
            query_embedding = self.embedder.encode([query], convert_to_tensor=True)
            query_embedding_np = query_embedding.cpu().detach().numpy()
            # Search in FAISS index; a lower L2 distance means a closer match
            distances, indices = self.faiss_index.search(query_embedding_np, top_k)
            # Prepare results
            results = []
            for i, idx in enumerate(indices[0]):
                if idx == -1:  # FAISS pads with -1 when fewer than top_k hits exist
                    continue
                try:
                    product = {}
                    row = self.data_cleaned.iloc[idx]
                    for column in self.data_cleaned.columns:
                        value = row[column]
                        # Convert numpy/pandas types to Python native types
                        if isinstance(value, (np.integer, np.floating)):
                            value = value.item()
                        elif isinstance(value, pd.Timestamp):
                            value = value.isoformat()
                        elif isinstance(value, np.bool_):
                            value = bool(value)
                        product[column] = value
                    product['score'] = float(distances[0][i])
                    results.append(product)
                except Exception as e:
                    logger.error(f"Error processing search result {i}: {e}", exc_info=True)
                    continue
            return results
        except Exception as e:
            logger.error(f"Error in search: {e}", exc_info=True)
            return []
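

# --- Usage sketch -----------------------------------------------------------
# A minimal, hypothetical example of wiring DataService to an embedder and
# running a search. The ModelService wrapper and the model name below are
# illustrative assumptions, not part of this codebase:
#
#     import asyncio
#     from sentence_transformers import SentenceTransformer
#
#     class ModelService:
#         def __init__(self):
#             self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
#
#     async def main():
#         service = DataService(ModelService())
#         for product in await service.search("organic peanut butter", top_k=3):
#             print(product['Name'], product['score'])
#
#     asyncio.run(main())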