amaye15 committed on
Commit 0611c31 · 1 Parent(s): abfb1fb

Feat - Use huggingface dataset instead of pandas

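For orientation, here is a minimal sketch of the call pattern this commit moves to. It is not code from the repository: the repo id, API key, and sample rows are placeholders, and `EmbeddingService`/`HuggingFaceService` are the classes changed in the diffs below.

```python
# Hypothetical end-to-end usage of the services after this commit.
import asyncio
from datasets import Dataset

from src.api.services.embedding_service import EmbeddingService
from src.api.services.huggingface_service import HuggingFaceService


async def demo() -> None:
    # Column-oriented dict -> Dataset (replaces building a pd.DataFrame).
    dataset = Dataset.from_dict({"text": ["hello", "world"]})

    # Generate embeddings for the "text" column; the key is a placeholder.
    service = EmbeddingService(openai_api_key="sk-placeholder")
    dataset = await service.create_embeddings(dataset, target_column="text")

    # Push the enriched Dataset straight to the Hub (repo id is hypothetical).
    await HuggingFaceService().push_to_hub(dataset, "user/demo-dataset")


asyncio.run(demo())
```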
src/api/services/embedding_service.py CHANGED
@@ -1,7 +1,136 @@
+# from openai import AsyncOpenAI
+# import logging
+# from typing import List, Dict, Union
+# import pandas as pd
+# import asyncio
+# from src.api.exceptions import OpenAIError
+
+# # Set up structured logging
+# logging.basicConfig(
+#     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+# )
+# logger = logging.getLogger(__name__)
+
+
+# class EmbeddingService:
+#     def __init__(
+#         self,
+#         openai_api_key: str,
+#         model: str = "text-embedding-3-small",
+#         batch_size: int = 10,
+#         max_concurrent_requests: int = 10,  # Limit to 10 concurrent requests
+#     ):
+#         self.client = AsyncOpenAI(api_key=openai_api_key)
+#         self.model = model
+#         self.batch_size = batch_size
+#         self.semaphore = asyncio.Semaphore(max_concurrent_requests)  # Rate limiter
+#         self.total_requests = 0  # Total number of requests to process
+#         self.completed_requests = 0  # Number of completed requests
+
+#     async def get_embedding(self, text: str) -> List[float]:
+#         """Generate embeddings for the given text using OpenAI."""
+#         text = text.replace("\n", " ")
+#         try:
+#             async with self.semaphore:  # Acquire a semaphore slot
+#                 response = await self.client.embeddings.create(
+#                     input=[text], model=self.model
+#                 )
+#                 self.completed_requests += 1  # Increment completed requests
+#                 self._log_progress()  # Log progress
+#                 return response.data[0].embedding
+#         except Exception as e:
+#             logger.error(f"Failed to generate embedding: {e}")
+#             raise OpenAIError(f"OpenAI API error: {e}")
+
+#     async def create_embeddings(
+#         self,
+#         data: Union[pd.DataFrame, List[str]],
+#         target_column: str = None,
+#         output_column: str = "embeddings",
+#     ) -> Union[pd.DataFrame, List[List[float]]]:
+#         """
+#         Create embeddings for either a DataFrame or a list of strings.
+
+#         Args:
+#             data: Either a DataFrame or a list of strings.
+#             target_column: The column in the DataFrame to generate embeddings for (required if data is a DataFrame).
+#             output_column: The column to store embeddings in the DataFrame (default: "embeddings").
+
+#         Returns:
+#             If data is a DataFrame, returns the DataFrame with the embeddings column.
+#             If data is a list of strings, returns a list of embeddings.
+#         """
+#         if isinstance(data, pd.DataFrame):
+#             if not target_column:
+#                 raise ValueError("target_column is required when data is a DataFrame.")
+#             return await self._create_embeddings_for_dataframe(
+#                 data, target_column, output_column
+#             )
+#         elif isinstance(data, list):
+#             return await self._create_embeddings_for_texts(data)
+#         else:
+#             raise TypeError(
+#                 "data must be either a pandas DataFrame or a list of strings."
+#             )

+#     async def _create_embeddings_for_dataframe(
+#         self, df: pd.DataFrame, target_column: str, output_column: str
+#     ) -> pd.DataFrame:
+#         """Create embeddings for the target column in the DataFrame."""
+#         logger.info("Generating embeddings for DataFrame...")
+#         self.total_requests = len(df)  # Set total number of requests
+#         self.completed_requests = 0  # Reset completed requests counter
+
+#         batches = [
+#             df[i : i + self.batch_size] for i in range(0, len(df), self.batch_size)
+#         ]
+#         processed_batches = await asyncio.gather(
+#             *[
+#                 self._process_batch(batch, target_column, output_column)
+#                 for batch in batches
+#             ]
+#         )
+#         return pd.concat(processed_batches)
+
+#     async def _create_embeddings_for_texts(self, texts: List[str]) -> List[List[float]]:
+#         """Create embeddings for a list of strings."""
+#         logger.info("Generating embeddings for list of texts...")
+#         self.total_requests = len(texts)  # Set total number of requests
+#         self.completed_requests = 0  # Reset completed requests counter
+
+#         batches = [
+#             texts[i : i + self.batch_size]
+#             for i in range(0, len(texts), self.batch_size)
+#         ]
+#         embeddings = []
+#         for batch in batches:
+#             batch_embeddings = await asyncio.gather(
+#                 *[self.get_embedding(text) for text in batch]
+#             )
+#             embeddings.extend(batch_embeddings)
+#         return embeddings
+
+#     async def _process_batch(
+#         self, df_batch: pd.DataFrame, target_column: str, output_column: str
+#     ) -> pd.DataFrame:
+#         """Process a batch of rows to generate embeddings."""
+#         embeddings = await asyncio.gather(
+#             *[self.get_embedding(row[target_column]) for _, row in df_batch.iterrows()]
+#         )
+#         df_batch[output_column] = embeddings
+#         return df_batch
+
+#     def _log_progress(self):
+#         """Log the progress of embedding generation."""
+#         progress = (self.completed_requests / self.total_requests) * 100
+#         logger.info(
+#             f"Progress: {self.completed_requests}/{self.total_requests} ({progress:.2f}%)"
+#         )
+
 from openai import AsyncOpenAI
 import logging
 from typing import List, Dict, Union
-import pandas as pd
+from datasets import Dataset
 import asyncio
 from src.api.exceptions import OpenAIError
 
@@ -44,53 +173,53 @@ class EmbeddingService:
 
     async def create_embeddings(
         self,
-        data: Union[pd.DataFrame, List[str]],
+        data: Union[Dataset, List[str]],
         target_column: str = None,
         output_column: str = "embeddings",
-    ) -> Union[pd.DataFrame, List[List[float]]]:
+    ) -> Union[Dataset, List[List[float]]]:
         """
-        Create embeddings for either a DataFrame or a list of strings.
+        Create embeddings for either a Dataset or a list of strings.
 
         Args:
-            data: Either a DataFrame or a list of strings.
-            target_column: The column in the DataFrame to generate embeddings for (required if data is a DataFrame).
-            output_column: The column to store embeddings in the DataFrame (default: "embeddings").
+            data: Either a Dataset or a list of strings.
+            target_column: The column in the Dataset to generate embeddings for (required if data is a Dataset).
+            output_column: The column to store embeddings in the Dataset (default: "embeddings").
 
         Returns:
-            If data is a DataFrame, returns the DataFrame with the embeddings column.
+            If data is a Dataset, returns the Dataset with the embeddings column.
             If data is a list of strings, returns a list of embeddings.
         """
-        if isinstance(data, pd.DataFrame):
+        if isinstance(data, Dataset):
             if not target_column:
-                raise ValueError("target_column is required when data is a DataFrame.")
-            return await self._create_embeddings_for_dataframe(
+                raise ValueError("target_column is required when data is a Dataset.")
+            return await self._create_embeddings_for_dataset(
                 data, target_column, output_column
             )
         elif isinstance(data, list):
            return await self._create_embeddings_for_texts(data)
        else:
            raise TypeError(
-                "data must be either a pandas DataFrame or a list of strings."
+                "data must be either a Hugging Face Dataset or a list of strings."
            )
 
-    async def _create_embeddings_for_dataframe(
-        self, df: pd.DataFrame, target_column: str, output_column: str
-    ) -> pd.DataFrame:
-        """Create embeddings for the target column in the DataFrame."""
-        logger.info("Generating embeddings for DataFrame...")
-        self.total_requests = len(df)  # Set total number of requests
+    async def _create_embeddings_for_dataset(
+        self, dataset: Dataset, target_column: str, output_column: str
+    ) -> Dataset:
+        """Create embeddings for the target column in the Dataset."""
+        logger.info("Generating embeddings for Dataset...")
+        self.total_requests = len(dataset)  # Set total number of requests
         self.completed_requests = 0  # Reset completed requests counter
 
-        batches = [
-            df[i : i + self.batch_size] for i in range(0, len(df), self.batch_size)
-        ]
-        processed_batches = await asyncio.gather(
-            *[
-                self._process_batch(batch, target_column, output_column)
-                for batch in batches
-            ]
-        )
-        return pd.concat(processed_batches)
+        embeddings = []
+        for i in range(0, len(dataset), self.batch_size):
+            batch = dataset[i : i + self.batch_size]
+            batch_embeddings = await asyncio.gather(
+                *[self.get_embedding(text) for text in batch[target_column]]
+            )
+            embeddings.extend(batch_embeddings)
+
+        dataset = dataset.add_column(output_column, embeddings)
+        return dataset
 
     async def _create_embeddings_for_texts(self, texts: List[str]) -> List[List[float]]:
         """Create embeddings for a list of strings."""
@@ -110,16 +239,6 @@ class EmbeddingService:
             embeddings.extend(batch_embeddings)
         return embeddings
 
-    async def _process_batch(
-        self, df_batch: pd.DataFrame, target_column: str, output_column: str
-    ) -> pd.DataFrame:
-        """Process a batch of rows to generate embeddings."""
-        embeddings = await asyncio.gather(
-            *[self.get_embedding(row[target_column]) for _, row in df_batch.iterrows()]
-        )
-        df_batch[output_column] = embeddings
-        return df_batch
-
     def _log_progress(self):
         """Log the progress of embedding generation."""
         progress = (self.completed_requests / self.total_requests) * 100
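Two behaviors of `datasets` carry the weight in the new `_create_embeddings_for_dataset`: slicing a `Dataset` yields a plain column dict (so `batch[target_column]` is a list of strings), and `add_column` expects exactly one value per row. A quick self-contained illustration, not taken from the repo:

```python
from datasets import Dataset

ds = Dataset.from_dict({"text": ["a", "b", "c", "d", "e"]})

# Slicing returns a dict of column -> list, not a smaller Dataset.
batch = ds[0:2]
assert batch["text"] == ["a", "b"]

# add_column needs one value per row; the loop above guarantees this by
# appending one embedding per input text, in dataset order.
ds = ds.add_column("embeddings", [[0.0], [0.1], [0.2], [0.3], [0.4]])
assert ds.column_names == ["text", "embeddings"]
```

Note the concurrency profile also changes: the pandas version gathered all batches at once, while the new loop awaits one batch at a time, so at most `batch_size` requests are in flight (further capped by the semaphore).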
src/api/services/huggingface_service.py CHANGED
@@ -1,9 +1,111 @@
+# from datasets import Dataset, load_dataset, concatenate_datasets
+# from huggingface_hub import HfApi, HfFolder
+# import logging
+# import os
+# from typing import Optional, Dict, List
+# import pandas as pd
+# from src.api.services.embedding_service import EmbeddingService
+# from src.api.exceptions import (
+#     DatasetNotFoundError,
+#     DatasetPushError,
+#     DatasetDeleteError,
+# )
+
+# # Set up structured logging
+# logging.basicConfig(
+#     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+# )
+# logger = logging.getLogger(__name__)
+
+
+# class HuggingFaceService:
+#     def __init__(self, hf_token: Optional[str] = None):
+#         """Initialize the HuggingFaceService with an optional token."""
+#         self.hf_api = HfApi()
+#         if hf_token:
+#             HfFolder.save_token(hf_token)  # Save the token for authentication
+
+#     async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
+#         """Push the dataset to Hugging Face Hub."""
+#         try:
+#             logger.info(f"Creating Hugging Face Dataset: {dataset_name}...")
+#             ds = Dataset.from_pandas(df)
+#             ds.push_to_hub(dataset_name)
+#             logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
+#         except Exception as e:
+#             logger.error(f"Failed to push dataset to Hugging Face Hub: {e}")
+#             raise DatasetPushError(f"Failed to push dataset: {e}")
+
+#     async def read_dataset(self, dataset_name: str) -> Optional[pd.DataFrame]:
+#         """Read a dataset from Hugging Face Hub."""
+#         try:
+#             logger.info(f"Loading dataset from Hugging Face Hub: {dataset_name}...")
+#             ds = load_dataset(dataset_name)
+#             df = ds["train"].to_dict()
+#             return df
+#         except Exception as e:
+#             logger.error(f"Failed to read dataset: {e}")
+#             raise DatasetNotFoundError(f"Dataset not found: {e}")
+
+#     async def update_dataset(
+#         self,
+#         dataset_name: str,
+#         updates: Dict[str, List],
+#         target_column: str,
+#         output_column: str = "embeddings",
+#     ) -> Optional[pd.DataFrame]:
+#         """Update a dataset on Hugging Face Hub by generating embeddings for new data and concatenating it with the existing dataset."""
+#         try:
+#             # Step 1: Load the existing dataset from Hugging Face Hub
+#             logger.info(
+#                 f"Loading existing dataset from Hugging Face Hub: {dataset_name}..."
+#             )
+#             existing_ds = await self.read_dataset(dataset_name)
+#             existing_df = pd.DataFrame(existing_ds)
+
+#             # Step 2: Convert the new updates into a DataFrame
+#             logger.info("Converting updates to DataFrame...")
+#             new_df = pd.DataFrame(updates)
+
+#             # Step 3: Generate embeddings for the new data
+#             logger.info("Generating embeddings for the new data...")
+#             embedding_service = EmbeddingService(
+#                 openai_api_key=os.getenv("OPENAI_API_KEY")
+#             )  # Get the embedding service
+#             new_df = await embedding_service.create_embeddings(
+#                 new_df, target_column, output_column
+#             )
+
+#             # Step 4: Concatenate the existing DataFrame with the new DataFrame
+#             logger.info("Concatenating existing dataset with new data...")
+#             updated_df = pd.concat([existing_df, new_df], ignore_index=True)
+
+#             # Step 5: Push the updated dataset back to Hugging Face Hub
+#             logger.info(
+#                 f"Pushing updated dataset to Hugging Face Hub: {dataset_name}..."
+#             )
+#             await self.push_to_hub(updated_df, dataset_name)
+
+#             return updated_df
+#         except Exception as e:
+#             logger.error(f"Failed to update dataset: {e}")
+#             raise DatasetPushError(f"Failed to update dataset: {e}")
+
+#     async def delete_dataset(self, dataset_name: str) -> None:
+#         """Delete a dataset from Hugging Face Hub."""
+#         try:
+#             logger.info(f"Deleting dataset from Hugging Face Hub: {dataset_name}...")
+#             self.hf_api.delete_repo(repo_id=dataset_name, repo_type="dataset")
+#             logger.info(f"Dataset deleted from Hugging Face Hub: {dataset_name}")
+#         except Exception as e:
+#             logger.error(f"Failed to delete dataset: {e}")
+#             raise DatasetDeleteError(f"Failed to delete dataset: {e}")
+
 from datasets import Dataset, load_dataset, concatenate_datasets
 from huggingface_hub import HfApi, HfFolder
 import logging
 import os
 from typing import Optional, Dict, List
-import pandas as pd
 from src.api.services.embedding_service import EmbeddingService
 from src.api.exceptions import (
     DatasetNotFoundError,
@@ -25,24 +127,22 @@ class HuggingFaceService:
         if hf_token:
             HfFolder.save_token(hf_token)  # Save the token for authentication
 
-    async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
+    async def push_to_hub(self, dataset: Dataset, dataset_name: str) -> None:
         """Push the dataset to Hugging Face Hub."""
         try:
             logger.info(f"Creating Hugging Face Dataset: {dataset_name}...")
-            ds = Dataset.from_pandas(df)
-            ds.push_to_hub(dataset_name)
+            dataset.push_to_hub(dataset_name)
             logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
         except Exception as e:
             logger.error(f"Failed to push dataset to Hugging Face Hub: {e}")
             raise DatasetPushError(f"Failed to push dataset: {e}")
 
-    async def read_dataset(self, dataset_name: str) -> Optional[pd.DataFrame]:
+    async def read_dataset(self, dataset_name: str) -> Optional[Dataset]:
         """Read a dataset from Hugging Face Hub."""
         try:
             logger.info(f"Loading dataset from Hugging Face Hub: {dataset_name}...")
-            ds = load_dataset(dataset_name)
-            df = ds["train"].to_dict()
-            return df
+            dataset = load_dataset(dataset_name)
+            return dataset["train"]
         except Exception as e:
             logger.error(f"Failed to read dataset: {e}")
             raise DatasetNotFoundError(f"Dataset not found: {e}")
@@ -53,40 +153,39 @@ class HuggingFaceService:
         updates: Dict[str, List],
         target_column: str,
         output_column: str = "embeddings",
-    ) -> Optional[pd.DataFrame]:
+    ) -> Optional[Dataset]:
         """Update a dataset on Hugging Face Hub by generating embeddings for new data and concatenating it with the existing dataset."""
         try:
             # Step 1: Load the existing dataset from Hugging Face Hub
             logger.info(
                 f"Loading existing dataset from Hugging Face Hub: {dataset_name}..."
             )
-            existing_ds = await self.read_dataset(dataset_name)
-            existing_df = pd.DataFrame(existing_ds)
+            existing_dataset = await self.read_dataset(dataset_name)
 
-            # Step 2: Convert the new updates into a DataFrame
-            logger.info("Converting updates to DataFrame...")
-            new_df = pd.DataFrame(updates)
+            # Step 2: Convert the new updates into a Dataset
+            logger.info("Converting updates to Dataset...")
+            new_dataset = Dataset.from_dict(updates)
 
             # Step 3: Generate embeddings for the new data
             logger.info("Generating embeddings for the new data...")
             embedding_service = EmbeddingService(
                 openai_api_key=os.getenv("OPENAI_API_KEY")
             )  # Get the embedding service
-            new_df = await embedding_service.create_embeddings(
-                new_df, target_column, output_column
+            new_dataset = await embedding_service.create_embeddings(
+                new_dataset, target_column, output_column
             )
 
-            # Step 4: Concatenate the existing DataFrame with the new DataFrame
+            # Step 4: Concatenate the existing Dataset with the new Dataset
             logger.info("Concatenating existing dataset with new data...")
-            updated_df = pd.concat([existing_df, new_df], ignore_index=True)
+            updated_dataset = concatenate_datasets([existing_dataset, new_dataset])
 
             # Step 5: Push the updated dataset back to Hugging Face Hub
             logger.info(
                 f"Pushing updated dataset to Hugging Face Hub: {dataset_name}..."
             )
-            await self.push_to_hub(updated_df, dataset_name)
+            await self.push_to_hub(updated_dataset, dataset_name)
 
-            return updated_df
+            return updated_dataset
         except Exception as e:
             logger.error(f"Failed to update dataset: {e}")
             raise DatasetPushError(f"Failed to update dataset: {e}")
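The rewritten `update_dataset` leans on `concatenate_datasets`, which requires both datasets to share the same columns and compatible features; in practice the existing Hub dataset must already contain `output_column`, since the new rows gain it in Step 3. A sketch with hypothetical data:

```python
from datasets import Dataset, concatenate_datasets

existing = Dataset.from_dict({"text": ["old row"], "embeddings": [[0.1, 0.2]]})
new = Dataset.from_dict({"text": ["new row"], "embeddings": [[0.3, 0.4]]})

# Works because both datasets have identical column names and feature types.
merged = concatenate_datasets([existing, new])
assert len(merged) == 2
assert merged.column_names == ["text", "embeddings"]
```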
src/main.py CHANGED
@@ -1,9 +1,259 @@
+# import os
+# from fastapi import FastAPI, Depends, HTTPException
+# from fastapi.responses import JSONResponse, RedirectResponse
+# from fastapi.middleware.gzip import GZipMiddleware
+# from pydantic import BaseModel
+# from typing import List, Dict
+# from src.api.models.embedding_models import (
+#     CreateEmbeddingRequest,
+#     ReadEmbeddingRequest,
+#     UpdateEmbeddingRequest,
+#     DeleteEmbeddingRequest,
+#     EmbedRequest,
+# )
+# from src.api.database import get_db, Database, QueryExecutionError, HealthCheckError
+# from src.api.services.embedding_service import EmbeddingService
+# from src.api.services.huggingface_service import HuggingFaceService
+# from src.api.exceptions import DatasetNotFoundError, DatasetPushError, OpenAIError
+
+# # from src.api.dependency import get_embedding_service, get_huggingface_service
+# import pandas as pd
+# import logging
+# from dotenv import load_dotenv
+
+# # Load environment variables
+# load_dotenv()
+
+# # Set up structured logging
+# logging.basicConfig(
+#     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+# )
+# logger = logging.getLogger(__name__)
+
+# description = """A FastAPI application for similarity search with PostgreSQL and OpenAI embeddings.
+
+# Direct/API URL:
+# https://re-mind-similarity-search.hf.space
+# """
+
+# # Initialize FastAPI app
+# app = FastAPI(
+#     title="Similarity Search API",
+#     description=description,
+#     version="1.0.0",
+# )
+
+# app.add_middleware(GZipMiddleware, minimum_size=1000)
+
+
+# # Dependency to get EmbeddingService
+# def get_embedding_service() -> EmbeddingService:
+#     return EmbeddingService(openai_api_key=os.getenv("OPENAI_API_KEY"))
+
+
+# # Dependency to get HuggingFaceService
+# def get_huggingface_service() -> HuggingFaceService:
+#     return HuggingFaceService()
+
+
+# # Root endpoint redirects to /docs
+# @app.get("/")
+# async def root():
+#     return RedirectResponse(url="/docs")
+
+
+# # Health check endpoint
+# @app.get("/health")
+# async def health_check(db: Database = Depends(get_db)):
+#     try:
+#         is_healthy = await db.health_check()
+#         if not is_healthy:
+#             raise HTTPException(status_code=500, detail="Database is unhealthy")
+#         return {"status": "healthy"}
+#     except HealthCheckError as e:
+#         raise HTTPException(status_code=500, detail=str(e))
+
+
+# # Endpoint to generate embeddings for a list of strings
+# @app.post("/embed")
+# async def embed(
+#     request: EmbedRequest,
+#     embedding_service: EmbeddingService = Depends(get_embedding_service),
+# ):
+#     """
+#     Generate embeddings for a list of strings and return them in the response.
+#     """
+#     try:
+#         # Step 1: Generate embeddings
+#         logger.info("Generating embeddings for list of texts...")
+#         embeddings = await embedding_service.create_embeddings(request.texts)
+
+#         return JSONResponse(
+#             content={
+#                 "message": "Embeddings generated successfully.",
+#                 "embeddings": embeddings,
+#                 "num_texts": len(request.texts),
+#             }
+#         )
+#     except OpenAIError as e:
+#         logger.error(f"OpenAI API error: {e}")
+#         raise HTTPException(status_code=500, detail=f"OpenAI API error: {e}")
+#     except Exception as e:
+#         logger.error(f"An error occurred: {e}")
+#         raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
+
+
+# # Endpoint to create embeddings from a database query
+# @app.post("/create_embedding")
+# async def create_embedding(
+#     request: CreateEmbeddingRequest,
+#     db: Database = Depends(get_db),
+#     embedding_service: EmbeddingService = Depends(get_embedding_service),
+#     huggingface_service: HuggingFaceService = Depends(get_huggingface_service),
+# ):
+#     """
+#     Create embeddings for the target column in the dataset.
+#     """
+#     try:
+#         # Step 1: Query the database
+#         logger.info("Fetching data from the database...")
+#         result = await db.fetch(request.query)
+#         df = pd.DataFrame(result)
+
+#         # Step 2: Generate embeddings
+#         df = await embedding_service.create_embeddings(
+#             df, request.target_column, request.output_column
+#         )
+
+#         # Step 3: Push to Hugging Face Hub
+#         await huggingface_service.push_to_hub(df, request.dataset_name)
+
+#         return JSONResponse(
+#             content={
+#                 "message": "Embeddings created and pushed to Hugging Face Hub.",
+#                 "dataset_name": request.dataset_name,
+#                 "num_rows": len(df),
+#             }
+#         )
+#     except QueryExecutionError as e:
+#         logger.error(f"Database query failed: {e}")
+#         raise HTTPException(status_code=500, detail=f"Database query failed: {e}")
+#     except OpenAIError as e:
+#         logger.error(f"OpenAI API error: {e}")
+#         raise HTTPException(status_code=500, detail=f"OpenAI API error: {e}")
+#     except DatasetPushError as e:
+#         logger.error(f"Failed to push dataset: {e}")
+#         raise HTTPException(status_code=500, detail=f"Failed to push dataset: {e}")
+#     except Exception as e:
+#         logger.error(f"An error occurred: {e}")
+#         raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
+
+
+# # Endpoint to read embeddings
+# @app.post("/read_embeddings")
+# async def read_embeddings(
+#     request: ReadEmbeddingRequest,
+#     huggingface_service: HuggingFaceService = Depends(get_huggingface_service),
+# ):
+#     """
+#     Read embeddings from a Hugging Face dataset.
+#     """
+#     try:
+#         df = await huggingface_service.read_dataset(request.dataset_name)
+#         return df
+#     except DatasetNotFoundError as e:
+#         logger.error(f"Dataset not found: {e}")
+#         raise HTTPException(status_code=404, detail=f"Dataset not found: {e}")
+#     except Exception as e:
+#         logger.error(f"An error occurred: {e}")
+#         raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
+
+
+# # Endpoint to update embeddings
+# # @app.post("/update_embeddings")
+# # async def update_embeddings(
+# #     request: UpdateEmbeddingRequest,
+# #     huggingface_service: HuggingFaceService = Depends(get_huggingface_service),
+# # ):
+# #     """
+# #     Update embeddings in a Hugging Face dataset.
+# #     """
+# #     try:
+# #         df = await huggingface_service.update_dataset(
+# #             request.dataset_name, request.updates
+# #         )
+# #         return {
+# #             "message": "Embeddings updated successfully.",
+# #             "dataset_name": request.dataset_name,
+# #         }
+# #     except DatasetPushError as e:
+# #         logger.error(f"Failed to update dataset: {e}")
+# #         raise HTTPException(status_code=500, detail=f"Failed to update dataset: {e}")
+# #     except Exception as e:
+# #         logger.error(f"An error occurred: {e}")
+# #         raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
+
+
+# @app.post("/update_embeddings")
+# async def update_embeddings(
+#     request: UpdateEmbeddingRequest,
+#     huggingface_service: HuggingFaceService = Depends(get_huggingface_service),
+# ):
+#     """
+#     Update embeddings in a Hugging Face dataset by generating embeddings for new data and concatenating it with the existing dataset.
+#     """
+#     try:
+#         # Call the update_dataset method to generate embeddings, concatenate, and push the updated dataset
+#         updated_df = await huggingface_service.update_dataset(
+#             request.dataset_name,
+#             request.updates,
+#             request.target_column,
+#             request.output_column,
+#         )
+
+#         return {
+#             "message": "Embeddings updated successfully.",
+#             "dataset_name": request.dataset_name,
+#             "num_rows": len(updated_df),
+#         }
+#     except DatasetPushError as e:
+#         logger.error(f"Failed to update dataset: {e}")
+#         raise HTTPException(status_code=500, detail=f"Failed to update dataset: {e}")
+#     except Exception as e:
+#         logger.error(f"An error occurred: {e}")
+#         raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
+
+
+# # Endpoint to delete embeddings
+# @app.post("/delete_embeddings")
+# async def delete_embeddings(
+#     request: DeleteEmbeddingRequest,
+#     huggingface_service: HuggingFaceService = Depends(get_huggingface_service),
+# ):
+#     """
+#     Delete embeddings from a Hugging Face dataset.
+#     """
+#     try:
+#         await huggingface_service.delete_dataset(request.dataset_name)
+#         return {
+#             "message": "Embeddings deleted successfully.",
+#             "dataset_name": request.dataset_name,
+#         }
+#     except DatasetPushError as e:
+#         logger.error(f"Failed to delete columns: {e}")
+#         raise HTTPException(status_code=500, detail=f"Failed to delete columns: {e}")
+#     except Exception as e:
+#         logger.error(f"An error occurred: {e}")
+#         raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
+
+
 import os
 from fastapi import FastAPI, Depends, HTTPException
 from fastapi.responses import JSONResponse, RedirectResponse
 from fastapi.middleware.gzip import GZipMiddleware
 from pydantic import BaseModel
 from typing import List, Dict
+from datasets import Dataset
 from src.api.models.embedding_models import (
     CreateEmbeddingRequest,
     ReadEmbeddingRequest,
@@ -16,8 +266,6 @@ from src.api.services.embedding_service import EmbeddingService
 from src.api.services.huggingface_service import HuggingFaceService
 from src.api.exceptions import DatasetNotFoundError, DatasetPushError, OpenAIError
 
-# from src.api.dependency import get_embedding_service, get_huggingface_service
-import pandas as pd
 import logging
 from dotenv import load_dotenv
 
@@ -118,21 +366,21 @@ async def create_embedding(
         # Step 1: Query the database
         logger.info("Fetching data from the database...")
         result = await db.fetch(request.query)
-        df = pd.DataFrame(result)
+        dataset = Dataset.from_dict(result)
 
         # Step 2: Generate embeddings
-        df = await embedding_service.create_embeddings(
-            df, request.target_column, request.output_column
+        dataset = await embedding_service.create_embeddings(
+            dataset, request.target_column, request.output_column
         )
 
         # Step 3: Push to Hugging Face Hub
-        await huggingface_service.push_to_hub(df, request.dataset_name)
+        await huggingface_service.push_to_hub(dataset, request.dataset_name)
 
         return JSONResponse(
             content={
                 "message": "Embeddings created and pushed to Hugging Face Hub.",
                 "dataset_name": request.dataset_name,
-                "num_rows": len(df),
+                "num_rows": len(dataset),
             }
         )
     except QueryExecutionError as e:
@@ -159,8 +407,8 @@ async def read_embeddings(
     Read embeddings from a Hugging Face dataset.
     """
     try:
-        df = await huggingface_service.read_dataset(request.dataset_name)
-        return df
+        dataset = await huggingface_service.read_dataset(request.dataset_name)
+        return dataset.to_dict()
     except DatasetNotFoundError as e:
         logger.error(f"Dataset not found: {e}")
         raise HTTPException(status_code=404, detail=f"Dataset not found: {e}")
@@ -170,30 +418,6 @@ async def read_embeddings(
 
 
 # Endpoint to update embeddings
-# @app.post("/update_embeddings")
-# async def update_embeddings(
-#     request: UpdateEmbeddingRequest,
-#     huggingface_service: HuggingFaceService = Depends(get_huggingface_service),
-# ):
-#     """
-#     Update embeddings in a Hugging Face dataset.
-#     """
-#     try:
-#         df = await huggingface_service.update_dataset(
-#             request.dataset_name, request.updates
-#         )
-#         return {
-#             "message": "Embeddings updated successfully.",
-#             "dataset_name": request.dataset_name,
-#         }
-#     except DatasetPushError as e:
-#         logger.error(f"Failed to update dataset: {e}")
-#         raise HTTPException(status_code=500, detail=f"Failed to update dataset: {e}")
-#     except Exception as e:
-#         logger.error(f"An error occurred: {e}")
-#         raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
-
-
 @app.post("/update_embeddings")
 async def update_embeddings(
     request: UpdateEmbeddingRequest,
@@ -204,7 +428,7 @@ async def update_embeddings(
     """
     try:
         # Call the update_dataset method to generate embeddings, concatenate, and push the updated dataset
-        updated_df = await huggingface_service.update_dataset(
+        updated_dataset = await huggingface_service.update_dataset(
             request.dataset_name,
             request.updates,
             request.target_column,
@@ -214,7 +438,7 @@ async def update_embeddings(
         return {
             "message": "Embeddings updated successfully.",
             "dataset_name": request.dataset_name,
-            "num_rows": len(updated_df),
+            "num_rows": len(updated_dataset),
         }
     except DatasetPushError as e:
         logger.error(f"Failed to update dataset: {e}")