Spaces:
Runtime error
Runtime error
from typing import Optional, Dict, Union, List | |
try: | |
from pinecone import Pinecone | |
from pinecone.config import Config | |
except ImportError: | |
raise ImportError( | |
"The `pinecone-client` package is not installed, please install using `pip install pinecone-client`." | |
) | |
from phi.document import Document | |
from phi.embedder import Embedder | |
from phi.vectordb.base import VectorDb | |
from phi.utils.log import logger | |
from pinecone.core.client.api.manage_indexes_api import ManageIndexesApi | |
from pinecone.models import ServerlessSpec, PodSpec | |
from pinecone.core.client.models import Vector | |
class PineconeDB(VectorDb):
    """Vector database backed by a Pinecone index.

    The Pinecone client and the index handle are created lazily on first
    access of the ``client`` / ``index`` properties and cached on the instance.

    Args:
        name (str): The name of the index.
        dimension (int): The dimension of the embeddings.
        spec (Union[Dict, ServerlessSpec, PodSpec]): The index spec.
        embedder (Optional[Embedder], optional): Embedder used for documents and queries.
            Defaults to None, in which case an ``OpenAIEmbedder`` is created.
        metric (Optional[str], optional): The metric used for similarity search. Defaults to "cosine".
        additional_headers (Optional[Dict[str, str]], optional): Additional headers to pass to the
            Pinecone client. Defaults to None (stored as an empty dict).
        pool_threads (Optional[int], optional): The number of threads to use for the Pinecone client. Defaults to 1.
        namespace (Optional[str], optional): Default namespace used by ``upsert``, ``search``,
            ``clear`` and ``doc_exists`` when no explicit namespace is given. Defaults to None.
        timeout (Optional[int], optional): The timeout for Pinecone operations. Defaults to None.
        index_api (Optional[ManageIndexesApi], optional): The Index API object. Defaults to None.
        api_key (Optional[str], optional): The Pinecone API key. Defaults to None.
        host (Optional[str], optional): The Pinecone host. Defaults to None.
        config (Optional[Config], optional): The Pinecone config. Defaults to None.
        **kwargs: Additional keyword arguments forwarded to the ``Pinecone`` constructor.

    Attributes:
        client (Pinecone): The Pinecone client (lazy property).
        index: The Pinecone index (lazy property).
        api_key (Optional[str]): The Pinecone API key.
        host (Optional[str]): The Pinecone host.
        config (Optional[Config]): The Pinecone config.
        additional_headers (Dict[str, str]): Additional headers to pass to the Pinecone client.
        pool_threads (Optional[int]): The number of threads to use for the Pinecone client.
        namespace (Optional[str]): Default namespace for data operations.
        index_api (Optional[ManageIndexesApi]): The Index API object.
        name (str): The name of the index.
        dimension (int): The dimension of the embeddings.
        spec (Union[Dict, ServerlessSpec, PodSpec]): The index spec.
        metric (Optional[str]): The metric used for similarity search.
        timeout (Optional[int]): The timeout for Pinecone operations.
        kwargs (Dict): Additional keyword arguments for the Pinecone client.
        embedder (Embedder): Embedder used for documents and queries.
    """

    def __init__(
        self,
        name: str,
        dimension: int,
        spec: Union[Dict, ServerlessSpec, PodSpec],
        embedder: Optional[Embedder] = None,
        metric: Optional[str] = "cosine",
        additional_headers: Optional[Dict[str, str]] = None,
        pool_threads: Optional[int] = 1,
        namespace: Optional[str] = None,
        timeout: Optional[int] = None,
        index_api: Optional[ManageIndexesApi] = None,
        api_key: Optional[str] = None,
        host: Optional[str] = None,
        config: Optional[Config] = None,
        **kwargs,
    ):
        # Lazily populated by the `client` and `index` properties.
        self._client = None
        self._index = None

        self.api_key: Optional[str] = api_key
        self.host: Optional[str] = host
        self.config: Optional[Config] = config
        self.additional_headers: Dict[str, str] = additional_headers or {}
        self.pool_threads: Optional[int] = pool_threads
        self.namespace: Optional[str] = namespace
        self.index_api: Optional[ManageIndexesApi] = index_api
        self.name: str = name
        self.dimension: int = dimension
        self.spec: Union[Dict, ServerlessSpec, PodSpec] = spec
        self.metric: Optional[str] = metric
        self.timeout: Optional[int] = timeout
        self.kwargs: Dict = kwargs

        # Embedder for embedding the document contents
        _embedder = embedder
        if _embedder is None:
            # Imported here so the OpenAI embedder is only loaded when it is actually the fallback.
            from phi.embedder.openai import OpenAIEmbedder

            _embedder = OpenAIEmbedder()
        self.embedder: Embedder = _embedder

    @property
    def client(self) -> Pinecone:
        """The Pinecone client, created on first access.

        Exposed as a property because the other methods of this class use it
        as an attribute (e.g. ``self.client.list_indexes()``).

        Returns:
            Pinecone: The Pinecone client.
        """
        if self._client is None:
            logger.debug("Creating Pinecone Client")
            self._client = Pinecone(
                api_key=self.api_key,
                host=self.host,
                config=self.config,
                additional_headers=self.additional_headers,
                pool_threads=self.pool_threads,
                index_api=self.index_api,
                **self.kwargs,
            )
        return self._client

    @property
    def index(self):
        """The Pinecone index handle, created on first access.

        Exposed as a property because the other methods of this class use it
        as an attribute (e.g. ``self.index.fetch(...)``).

        Returns:
            Pinecone.Index: The Pinecone index.
        """
        if self._index is None:
            logger.debug(f"Connecting to Pinecone Index: {self.name}")
            self._index = self.client.Index(self.name)
        return self._index

    def _resolve_namespace(self, namespace: Optional[str]) -> Optional[str]:
        """Return the explicit namespace, or the instance default when None is given."""
        # `is None` (not truthiness) so an explicit "" is still honoured.
        return self.namespace if namespace is None else namespace

    def exists(self) -> bool:
        """Check if the index exists.

        Returns:
            bool: True if the index exists, False otherwise.
        """
        list_indexes = self.client.list_indexes()
        return self.name in list_indexes.names()

    def create(self) -> None:
        """Create the index if it does not exist."""
        if not self.exists():
            logger.debug(f"Creating index: {self.name}")
            self.client.create_index(
                name=self.name,
                dimension=self.dimension,
                spec=self.spec,
                metric=self.metric if self.metric is not None else "cosine",
                timeout=self.timeout,
            )

    def delete(self) -> None:
        """Delete the index if it exists."""
        if self.exists():
            logger.debug(f"Deleting index: {self.name}")
            self.client.delete_index(name=self.name, timeout=self.timeout)

    def doc_exists(self, document: Document) -> bool:
        """Check if a document exists in the index.

        The lookup is done by ``document.id`` in the instance's default namespace.

        Args:
            document (Document): The document to check.

        Returns:
            bool: True if the document exists, False otherwise.
        """
        response = self.index.fetch(ids=[document.id], namespace=self.namespace)
        return len(response.vectors) > 0

    def name_exists(self, name: str) -> bool:
        """Check if an index with the given name exists.

        Args:
            name (str): The name of the index.

        Returns:
            bool: True if the index exists, False otherwise.
        """
        try:
            self.client.describe_index(name)
            return True
        except Exception as e:
            # Best effort: any failure to describe the index is reported as "does not exist".
            logger.debug(f"Could not describe index {name}: {e}")
            return False

    def upsert(
        self,
        documents: List[Document],
        namespace: Optional[str] = None,
        batch_size: Optional[int] = None,
        show_progress: bool = False,
    ) -> None:
        """Upsert documents into the index.

        Each document is embedded with ``self.embedder`` and its content is
        stored under the ``"text"`` metadata key so ``search`` can rebuild it.

        Args:
            documents (List[Document]): The documents to upsert.
            namespace (Optional[str], optional): The namespace for the documents.
                Defaults to None, which falls back to ``self.namespace``.
            batch_size (Optional[int], optional): The batch size for upsert. Defaults to None.
            show_progress (bool, optional): Whether to show progress during upsert. Defaults to False.
        """
        if not documents:
            # Nothing to send; avoid a request with an empty vector list.
            logger.debug("No documents to upsert")
            return

        vectors = []
        for document in documents:
            document.embed(embedder=self.embedder)
            # NOTE: mutates the caller's document metadata.
            document.meta_data["text"] = document.content
            vectors.append(
                Vector(
                    id=document.id,
                    values=document.embedding,
                    metadata=document.meta_data,
                )
            )

        self.index.upsert(
            vectors=vectors,
            namespace=self._resolve_namespace(namespace),
            batch_size=batch_size,
            show_progress=show_progress,
        )

    def upsert_available(self) -> bool:
        """Check if upsert operation is available.

        Returns:
            bool: Always True.
        """
        return True

    def insert(self, documents: List[Document]) -> None:
        """Insert documents into the index.

        This method is not supported by Pinecone. Use `upsert` instead.

        Args:
            documents (List[Document]): The documents to insert.

        Raises:
            NotImplementedError: This method is not supported by Pinecone.
        """
        raise NotImplementedError("Pinecone does not support insert operations. Use upsert instead.")

    def search(
        self,
        query: str,
        limit: int = 5,
        namespace: Optional[str] = None,
        filter: Optional[Dict[str, Union[str, float, int, bool, List, dict]]] = None,
        include_values: Optional[bool] = None,
    ) -> List[Document]:
        """Search for similar documents in the index.

        Metadata is always requested, because the document content is read
        back from the ``"text"`` metadata key.

        Args:
            query (str): The query to search for.
            limit (int, optional): The maximum number of results to return. Defaults to 5.
            namespace (Optional[str], optional): The namespace to search in.
                Defaults to None, which falls back to ``self.namespace``.
            filter (Optional[Dict[str, Union[str, float, int, bool, List, dict]]], optional): The filter
                for the search. Defaults to None.
            include_values (Optional[bool], optional): Whether to include values in the search results.
                Defaults to None.

        Returns:
            List[Document]: The list of matching documents; empty if the query could not be embedded.
        """
        query_embedding = self.embedder.get_embedding(query)
        if query_embedding is None:
            logger.error(f"Error getting embedding for Query: {query}")
            return []

        response = self.index.query(
            vector=query_embedding,
            top_k=limit,
            namespace=self._resolve_namespace(namespace),
            filter=filter,
            include_values=include_values,
            include_metadata=True,
        )

        results: List[Document] = []
        for result in response.matches:
            # A match may carry no metadata; use an empty dict so both the content
            # lookup and the Document's meta_data get a real mapping.
            # NOTE(review): assumes Document.meta_data does not accept None - confirm.
            metadata = result.metadata if result.metadata is not None else {}
            results.append(
                Document(
                    content=metadata.get("text", ""),
                    id=result.id,
                    embedding=result.values,
                    meta_data=metadata,
                )
            )
        return results

    def optimize(self) -> None:
        """Optimize the index.

        Intentionally a no-op for this backend.
        """
        pass

    def clear(self, namespace: Optional[str] = None) -> bool:
        """Clear the index by deleting all vectors in a namespace.

        Args:
            namespace (Optional[str], optional): The namespace to clear.
                Defaults to None, which falls back to ``self.namespace``.

        Returns:
            bool: True if the delete call succeeded, False if it raised.
        """
        try:
            self.index.delete(delete_all=True, namespace=self._resolve_namespace(namespace))
            return True
        except Exception as e:
            # Keep the boolean contract, but do not hide why the clear failed.
            logger.warning(f"Failed to clear index {self.name}: {e}")
            return False