Spaces:
Running
Running
from openai import AsyncOpenAI | |
import logging | |
from typing import List, Dict | |
import pandas as pd | |
import asyncio | |
from src.api.exceptions import OpenAIError | |
# Configure the root logger at import time (timestamp, logger name, level,
# message), then create this module's own named logger.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class EmbeddingService:
    """Generate OpenAI embeddings for single texts or a DataFrame column.

    The service wraps an ``AsyncOpenAI`` client. ``create_embeddings`` splits
    a DataFrame into chunks of ``batch_size`` rows and embeds each row's text
    with a separate ``get_embedding`` call.
    """

    def __init__(
        self,
        openai_api_key: str,
        model: str = "text-embedding-3-small",
        batch_size: int = 100,
    ):
        """Store the client and batching configuration.

        Args:
            openai_api_key: API key handed to ``AsyncOpenAI``.
            model: Name of the embedding model passed on every request.
            batch_size: Number of DataFrame rows per chunk in
                ``create_embeddings``; must be a positive integer by the
                time ``create_embeddings`` is called.
        """
        self.client = AsyncOpenAI(api_key=openai_api_key)
        self.model = model
        self.batch_size = batch_size

    async def get_embedding(self, text: str) -> List[float]:
        """Generate the embedding vector for ``text`` using OpenAI.

        Newlines in ``text`` are replaced with spaces before the request.

        Raises:
            OpenAIError: If the embeddings request fails for any reason; the
                original exception is attached as ``__cause__``.
        """
        text = text.replace("\n", " ")
        try:
            response = await self.client.embeddings.create(
                input=[text], model=self.model
            )
            return response.data[0].embedding
        except Exception as e:
            logger.error("Failed to generate embedding: %s", e)
            # Chain the cause so the underlying failure stays in the traceback.
            raise OpenAIError(f"OpenAI API error: {e}") from e

    async def create_embeddings(
        self, df: pd.DataFrame, target_column: str, output_column: str
    ) -> pd.DataFrame:
        """Create embeddings for ``target_column`` and store them in ``output_column``.

        Args:
            df: Input rows. It is not modified; a new DataFrame is returned.
            target_column: Column holding the text to embed.
            output_column: Column that receives one embedding per row.

        Returns:
            A DataFrame with the rows of ``df`` in their original order plus
            ``output_column``. An empty ``df`` yields an empty copy that
            still carries ``output_column``.

        Raises:
            ValueError: If ``self.batch_size`` is not a positive integer.
            OpenAIError: If any embedding request fails.
        """
        if self.batch_size < 1:
            raise ValueError(
                f"batch_size must be a positive integer, got {self.batch_size}"
            )
        if len(df) == 0:
            # pd.concat([]) raises ValueError, so short-circuit empty input.
            result = df.copy()
            result[output_column] = []
            return result

        logger.info("Generating embeddings...")
        batches = [
            df.iloc[start : start + self.batch_size]
            for start in range(0, len(df), self.batch_size)
        ]
        # NOTE(review): every batch (and every row inside it) is gathered at
        # once, so batch_size only controls chunking, not the number of
        # requests in flight. Add a semaphore if rate limiting is required.
        processed_batches = await asyncio.gather(
            *(
                self._process_batch(batch, target_column, output_column)
                for batch in batches
            )
        )
        return pd.concat(processed_batches)

    async def _process_batch(
        self, df_batch: pd.DataFrame, target_column: str, output_column: str
    ) -> pd.DataFrame:
        """Embed every row of ``df_batch`` and return a copy with the results.

        The input is copied first: ``df_batch`` is normally a slice of the
        caller's DataFrame, and assigning a column onto a slice either warns
        or mutates the caller's data depending on the pandas version.
        """
        batch = df_batch.copy()
        # gather preserves argument order, so embeddings align with the rows.
        embeddings = await asyncio.gather(
            *(self.get_embedding(text) for text in batch[target_column])
        )
        batch[output_column] = embeddings
        return batch