# services/data_service.py
import asyncio
import logging
from datetime import datetime
from io import StringIO
from typing import Any, Dict, List, Tuple

import aiohttp
import faiss
import numpy as np
import pandas as pd

from config.config import settings

logger = logging.getLogger(__name__)

class DataService:
    def __init__(self, model_service):
        self.embedder = model_service.embedder
        self.cache = {}
        self.last_update = None
        self.faiss_index = None
        self.data_cleaned = None

    async def fetch_csv_data(self) -> pd.DataFrame:
        """Fetch CSV data from URL with retry logic"""
        async with aiohttp.ClientSession() as session:
            for attempt in range(settings.MAX_RETRIES):
                try:
                    async with session.get(settings.CSV_URL) as response:
                        if response.status == 200:
                            content = await response.text()
                            return pd.read_csv(StringIO(content), sep='|')
                        else:
                            logger.error(f"Failed to fetch data: HTTP {response.status}")
                except Exception as e:
                    logger.error(f"Attempt {attempt + 1} failed: {e}", exc_info=True)
                    if attempt == settings.MAX_RETRIES - 1:
                        raise
                    # Back off briefly before retrying instead of hammering the endpoint
                    await asyncio.sleep(2 ** attempt)
            return pd.DataFrame()  # Return empty DataFrame if all attempts fail

    async def prepare_data_and_index(self) -> Tuple[pd.DataFrame, Any]:
        """Prepare data and create FAISS index with caching"""
        try:
            current_time = datetime.now()
            
            # Check cache validity
            if (self.last_update and 
                (current_time - self.last_update).total_seconds() < settings.CACHE_DURATION and
                self.cache):
                return self.cache['data'], self.cache['index']

            data = await self.fetch_csv_data()
            if data.empty:
                logger.error("Failed to fetch data")
                return pd.DataFrame(), None

            # Data cleaning and preparation
            columns_to_keep = [
                'ID', 'Name', 'Description', 'Price', 
                'ProductCategory', 'Grammage', 
                'BasePriceText', 'Rating', 'RatingCount',
                'Ingredients', 'CreationDate', 'Keywords', 'Brand'
            ]
            
            self.data_cleaned = data[columns_to_keep].copy()
            
            # Clean description text
            self.data_cleaned['Description'] = self.data_cleaned['Description'].astype(str).str.replace(
                r'[^\w\s.,;:\'/?!€$%&()\[\]{}<>|=+\\-]', ' ', regex=True
            )
            
            # Combine text fields with weights
            self.data_cleaned['combined_text'] = self.data_cleaned.apply(
                lambda row: (
                    f"{row['Name']} {row['Name']} "  # Double weight for name
                    f"{str(row['Description'])} "
                    f"{str(row['Keywords']) if pd.notnull(row['Keywords']) else ''} "
                    f"{str(row['ProductCategory']) if pd.notnull(row['ProductCategory']) else ''}"
                ).strip(),
                axis=1
            )

            # Create FAISS index
            embeddings = self.embedder.encode(
                self.data_cleaned['combined_text'].tolist(),
                convert_to_tensor=True,
                show_progress_bar=True
            ).cpu().detach().numpy()

            d = embeddings.shape[1]
            self.faiss_index = faiss.IndexFlatL2(d)
            self.faiss_index.add(embeddings)
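
            # Design note: IndexFlatL2 performs exact search over raw L2
            # distance; a common alternative, if cosine similarity is wanted,
            # is to L2-normalize the embeddings and use faiss.IndexFlatIP.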

            # Update cache
            self.cache = {
                'data': self.data_cleaned,
                'index': self.faiss_index
            }
            self.last_update = current_time

            return self.data_cleaned, self.faiss_index

        except Exception as e:
            logger.error(f"Error in prepare_data_and_index: {e}", exc_info=True)
            return pd.DataFrame(), None

    async def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Search for products similar to the query"""
        try:
            if self.faiss_index is None:
                self.data_cleaned, self.faiss_index = await self.prepare_data_and_index()
                if self.faiss_index is None:
                    return []

            # Create query embedding
            query_embedding = self.embedder.encode([query], convert_to_tensor=True)
            query_embedding_np = query_embedding.cpu().detach().numpy()

            # Search in FAISS index
            distances, indices = self.faiss_index.search(query_embedding_np, top_k)

            # Prepare results
            results = []
            for i, idx in enumerate(indices[0]):
                if idx < 0:
                    # FAISS pads with -1 when the index holds fewer than top_k vectors
                    continue
                try:
                    product = {}
                    row = self.data_cleaned.iloc[idx]
                    for column in self.data_cleaned.columns:
                        value = row[column]
                        # Convert numpy/pandas types to Python native types
                        if isinstance(value, (np.integer, np.floating)):
                            value = value.item()
                        elif isinstance(value, pd.Timestamp):
                            value = value.isoformat()
                        elif isinstance(value, np.bool_):
                            value = bool(value)
                        product[column] = value
                    
                    product['score'] = float(distances[0][i])
                    results.append(product)
                except Exception as e:
                    logger.error(f"Error processing search result {i}: {e}", exc_info=True)
                    continue

            return results

        except Exception as e:
            logger.error(f"Error in search: {e}", exc_info=True)
            return []
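
# --- Usage sketch (illustrative only) ---
# A minimal way to exercise this service end to end. ModelService below is a
# hypothetical stand-in: DataService only requires that model_service expose
# an `.embedder` with a SentenceTransformer-style `.encode(...)`. The model
# name and the example query are assumptions, not something this module
# prescribes; it also assumes settings.CSV_URL points at a pipe-delimited CSV
# containing the columns listed in prepare_data_and_index.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    class ModelService:
        """Hypothetical wrapper matching what DataService expects."""
        def __init__(self):
            self.embedder = SentenceTransformer("all-MiniLM-L6-v2")

    async def main():
        service = DataService(ModelService())
        data, index = await service.prepare_data_and_index()
        print(f"Loaded {len(data)} rows; indexed "
              f"{index.ntotal if index is not None else 0} vectors")
        for hit in await service.search("organic peanut butter", top_k=3):
            # With IndexFlatL2, a lower score (L2 distance) means a closer match
            print(hit["Name"], round(hit["score"], 3))

    asyncio.run(main())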