# services/data_service.py
import asyncio
import logging
from datetime import datetime
from io import StringIO
from typing import Any, Dict, List, Tuple

import aiohttp
import faiss
import numpy as np
import pandas as pd

from config.config import settings

logger = logging.getLogger(__name__)

class DataService:
    def __init__(self, model_service):
        self.embedder = model_service.embedder
        self.cache = {}
        self.last_update = None
        self.faiss_index = None
        self.data_cleaned = None

    async def fetch_csv_data(self) -> pd.DataFrame:
        """Fetch CSV data from URL with retry logic"""
        async with aiohttp.ClientSession() as session:
            for attempt in range(settings.MAX_RETRIES):
                try:
                    async with session.get(settings.CSV_URL) as response:
                        if response.status == 200:
                            content = await response.text()
                            return pd.read_csv(StringIO(content), sep='|')
                        else:
                            logger.error(f"Failed to fetch data: HTTP {response.status}")
                except Exception as e:
                    logger.error(f"Attempt {attempt + 1} failed: {e}", exc_info=True)
                    if attempt == settings.MAX_RETRIES - 1:
                        raise
                    # Back off briefly before retrying instead of hammering the endpoint
                    await asyncio.sleep(2 ** attempt)
            return pd.DataFrame()  # Return empty DataFrame if all attempts fail

    async def prepare_data_and_index(self) -> Tuple[pd.DataFrame, Any]:
        """Prepare data and create FAISS index with caching"""
        try:
            current_time = datetime.now()
            
            # Check cache validity
            if (self.last_update and 
                (current_time - self.last_update).total_seconds() < settings.CACHE_DURATION and
                self.cache):
                return self.cache['data'], self.cache['index']

            data = await self.fetch_csv_data()
            if data.empty:
                logger.error("Failed to fetch data")
                return pd.DataFrame(), None

            # Data cleaning and preparation
            columns_to_keep = [
                'ID', 'Name', 'Description', 'Price', 
                'ProductCategory', 'Grammage', 
                'BasePriceText', 'Rating', 'RatingCount',
                'Ingredients', 'CreationDate', 'Keywords', 'Brand'
            ]
            
            self.data_cleaned = data[columns_to_keep].copy()
            
            # Clean description text
            self.data_cleaned['Description'] = self.data_cleaned['Description'].astype(str).str.replace(
                r'[^\w\s.,;:\'/?!€$%&()\[\]{}<>|=+\\-]', ' ', regex=True
            )
            
            # Combine text fields with weights
            self.data_cleaned['combined_text'] = self.data_cleaned.apply(
                lambda row: (
                    f"{row['Name']} {row['Name']} "  # Double weight for name
                    f"{str(row['Description'])} "
                    f"{str(row['Keywords']) if pd.notnull(row['Keywords']) else ''} "
                    f"{str(row['ProductCategory']) if pd.notnull(row['ProductCategory']) else ''}"
                ).strip(),
                axis=1
            )

            # Create FAISS index
            embeddings = self.embedder.encode(
                self.data_cleaned['combined_text'].tolist(),
                convert_to_tensor=True,
                show_progress_bar=True
            ).cpu().detach().numpy()

            d = embeddings.shape[1]
            self.faiss_index = faiss.IndexFlatL2(d)
            self.faiss_index.add(embeddings)
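
            # Design note: IndexFlatL2 performs exact search over raw L2
            # distance; a common alternative, if cosine similarity is wanted,
            # is to L2-normalize the embeddings and use faiss.IndexFlatIP.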

            # Update cache
            self.cache = {
                'data': self.data_cleaned,
                'index': self.faiss_index
            }
            self.last_update = current_time

            return self.data_cleaned, self.faiss_index

        except Exception as e:
            logger.error(f"Error in prepare_data_and_index: {e}", exc_info=True)
            return pd.DataFrame(), None

    async def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Search for products similar to the query"""
        try:
            if self.faiss_index is None:
                self.data_cleaned, self.faiss_index = await self.prepare_data_and_index()
                if self.faiss_index is None:
                    return []

            # Create query embedding
            query_embedding = self.embedder.encode([query], convert_to_tensor=True)
            query_embedding_np = query_embedding.cpu().detach().numpy()

            # Search in FAISS index
            distances, indices = self.faiss_index.search(query_embedding_np, top_k)

            # Prepare results
            results = []
            for i, idx in enumerate(indices[0]):
                if idx < 0:
                    # FAISS pads with -1 when the index holds fewer than top_k vectors
                    continue
                try:
                    product = {}
                    row = self.data_cleaned.iloc[idx]
                    for column in self.data_cleaned.columns:
                        value = row[column]
                        # Convert numpy/pandas types to Python native types
                        if isinstance(value, (np.integer, np.floating)):
                            value = value.item()
                        elif isinstance(value, pd.Timestamp):
                            value = value.isoformat()
                        elif isinstance(value, np.bool_):
                            value = bool(value)
                        product[column] = value
                    
                    product['score'] = float(distances[0][i])
                    results.append(product)
                except Exception as e:
                    logger.error(f"Error processing search result {i}: {e}", exc_info=True)
                    continue

            return results

        except Exception as e:
            logger.error(f"Error in search: {e}", exc_info=True)
            return []
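
# --- Usage sketch (illustrative only) ---
# A minimal way to exercise this service end to end. ModelService below is a
# hypothetical stand-in: DataService only requires that model_service expose
# an `.embedder` with a SentenceTransformer-style `.encode(...)`. The model
# name and the example query are assumptions, not something this module
# prescribes; it also assumes settings.CSV_URL points at a pipe-delimited CSV
# containing the columns listed in prepare_data_and_index.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    class ModelService:
        """Hypothetical wrapper matching what DataService expects."""
        def __init__(self):
            self.embedder = SentenceTransformer("all-MiniLM-L6-v2")

    async def main():
        service = DataService(ModelService())
        data, index = await service.prepare_data_and_index()
        print(f"Loaded {len(data)} rows; indexed "
              f"{index.ntotal if index is not None else 0} vectors")
        for hit in await service.search("organic peanut butter", top_k=3):
            # With IndexFlatL2, a lower score (L2 distance) means a closer match
            print(hit["Name"], round(hit["score"], 3))

    asyncio.run(main())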