from typing import Optional, Dict, Union, List

try:
    from pinecone import Pinecone
    from pinecone.config import Config
except ImportError:
    raise ImportError(
        "The `pinecone-client` package is not installed, please install using `pip install pinecone-client`."
    )

from phi.document import Document
from phi.embedder import Embedder
from phi.vectordb.base import VectorDb
from phi.utils.log import logger
from pinecone.core.client.api.manage_indexes_api import ManageIndexesApi
from pinecone.models import ServerlessSpec, PodSpec
from pinecone.core.client.models import Vector


class PineconeDB(VectorDb):
    """A class representing a Pinecone database.

    Args:
        name (str): The name of the index.
        dimension (int): The dimension of the embeddings.
        spec (Union[Dict, ServerlessSpec, PodSpec]): The index spec.
        embedder (Optional[Embedder], optional): The embedder used for documents and queries.
            Defaults to None, in which case an `OpenAIEmbedder` is created.
        metric (Optional[str], optional): The metric used for similarity search. Defaults to "cosine".
        additional_headers (Optional[Dict[str, str]], optional): Additional headers to pass to the
            Pinecone client. Defaults to None, which is stored as an empty dict.
        pool_threads (Optional[int], optional): The number of threads to use for the Pinecone client.
            Defaults to 1.
        namespace (Optional[str], optional): The default namespace used by `upsert`, `search`, `clear`
            and `doc_exists` when no namespace is given for the call. Defaults to None.
        timeout (Optional[int], optional): The timeout for Pinecone operations. Defaults to None.
        index_api (Optional[ManageIndexesApi], optional): The Index API object. Defaults to None.
        api_key (Optional[str], optional): The Pinecone API key. Defaults to None.
        host (Optional[str], optional): The Pinecone host. Defaults to None.
        config (Optional[Config], optional): The Pinecone config. Defaults to None.
        **kwargs: Additional keyword arguments forwarded to the `Pinecone` client constructor.

    Attributes:
        client (Pinecone): The Pinecone client (created lazily on first access).
        index: The Pinecone index (connected lazily on first access).
        api_key (Optional[str]): The Pinecone API key.
        host (Optional[str]): The Pinecone host.
        config (Optional[Config]): The Pinecone config.
        additional_headers (Dict[str, str]): Additional headers to pass to the Pinecone client.
        pool_threads (Optional[int]): The number of threads to use for the Pinecone client.
        namespace (Optional[str]): The default namespace for index operations.
        index_api (Optional[ManageIndexesApi]): The Index API object.
        name (str): The name of the index.
        dimension (int): The dimension of the embeddings.
        spec (Union[Dict, ServerlessSpec, PodSpec]): The index spec.
        metric (Optional[str]): The metric used for similarity search.
        timeout (Optional[int]): The timeout for Pinecone operations.
        kwargs (Optional[Dict[str, str]]): Additional keyword arguments.
        embedder (Embedder): The embedder used for documents and queries.
    """

    def __init__(
        self,
        name: str,
        dimension: int,
        spec: Union[Dict, ServerlessSpec, PodSpec],
        embedder: Optional[Embedder] = None,
        metric: Optional[str] = "cosine",
        additional_headers: Optional[Dict[str, str]] = None,
        pool_threads: Optional[int] = 1,
        namespace: Optional[str] = None,
        timeout: Optional[int] = None,
        index_api: Optional[ManageIndexesApi] = None,
        api_key: Optional[str] = None,
        host: Optional[str] = None,
        config: Optional[Config] = None,
        **kwargs,
    ):
        # Client and index handles are created lazily by the properties below.
        self._client = None
        self._index = None

        self.api_key: Optional[str] = api_key
        self.host: Optional[str] = host
        self.config: Optional[Config] = config
        self.additional_headers: Dict[str, str] = additional_headers or {}
        self.pool_threads: Optional[int] = pool_threads
        self.namespace: Optional[str] = namespace
        self.index_api: Optional[ManageIndexesApi] = index_api
        self.name: str = name
        self.dimension: int = dimension
        self.spec: Union[Dict, ServerlessSpec, PodSpec] = spec
        self.metric: Optional[str] = metric
        self.timeout: Optional[int] = timeout
        self.kwargs: Optional[Dict[str, str]] = kwargs

        # Embedder for embedding the document contents
        _embedder = embedder
        if _embedder is None:
            # Imported here so the OpenAI embedder is only loaded when it is actually needed.
            from phi.embedder.openai import OpenAIEmbedder

            _embedder = OpenAIEmbedder()
        self.embedder: Embedder = _embedder

    def _resolve_namespace(self, namespace: Optional[str]) -> Optional[str]:
        """Return the per-call namespace, falling back to the instance default.

        `is None` is used rather than truthiness so that an explicitly passed
        empty string is not overridden by the instance default.
        """
        return self.namespace if namespace is None else namespace

    @property
    def client(self) -> Pinecone:
        """The Pinecone client.

        Returns:
            Pinecone: The Pinecone client.
        """
        if self._client is None:
            logger.debug("Creating Pinecone Client")
            self._client = Pinecone(
                api_key=self.api_key,
                host=self.host,
                config=self.config,
                additional_headers=self.additional_headers,
                pool_threads=self.pool_threads,
                index_api=self.index_api,
                **self.kwargs,
            )
        return self._client

    @property
    def index(self):
        """The Pinecone index.

        Returns:
            Pinecone.Index: The Pinecone index.
        """
        if self._index is None:
            logger.debug(f"Connecting to Pinecone Index: {self.name}")
            self._index = self.client.Index(self.name)
        return self._index

    def exists(self) -> bool:
        """Check if the index exists.

        Returns:
            bool: True if the index exists, False otherwise.
        """
        list_indexes = self.client.list_indexes()
        return self.name in list_indexes.names()

    def create(self) -> None:
        """Create the index if it does not exist."""
        if not self.exists():
            logger.debug(f"Creating index: {self.name}")
            self.client.create_index(
                name=self.name,
                dimension=self.dimension,
                spec=self.spec,
                metric=self.metric if self.metric is not None else "cosine",
                timeout=self.timeout,
            )

    def delete(self) -> None:
        """Delete the index if it exists."""
        if self.exists():
            logger.debug(f"Deleting index: {self.name}")
            self.client.delete_index(name=self.name, timeout=self.timeout)

    def doc_exists(self, document: Document) -> bool:
        """Check if a document exists in the index.

        The lookup is done by document id in the instance's default namespace,
        so it looks in the same place `upsert` writes to by default.

        Args:
            document (Document): The document to check.

        Returns:
            bool: True if the document exists, False otherwise.
        """
        response = self.index.fetch(ids=[document.id], namespace=self.namespace)
        return len(response.vectors) > 0

    def name_exists(self, name: str) -> bool:
        """Check if an index with the given name exists.

        Any error raised while describing the index is treated as "does not
        exist"; the error is logged at debug level.

        Args:
            name (str): The name of the index.

        Returns:
            bool: True if the index exists, False otherwise.
        """
        try:
            self.client.describe_index(name)
            return True
        except Exception as e:
            logger.debug(f"Could not describe index '{name}': {e}")
            return False

    def upsert(
        self,
        documents: List[Document],
        namespace: Optional[str] = None,
        batch_size: Optional[int] = None,
        show_progress: bool = False,
    ) -> None:
        """Upsert documents into the index.

        Each document is embedded with the configured embedder, and its content
        is stored in the vector metadata under the "text" key so that `search`
        can rebuild the document.

        Args:
            documents (List[Document]): The documents to upsert.
            namespace (Optional[str], optional): The namespace for the documents.
                Defaults to None, in which case the instance namespace is used.
            batch_size (Optional[int], optional): The batch size for upsert. Defaults to None.
            show_progress (bool, optional): Whether to show progress during upsert. Defaults to False.
        """
        if not documents:
            # Nothing to send; skip the API call.
            logger.debug("No documents to upsert")
            return

        vectors = []
        for document in documents:
            document.embed(embedder=self.embedder)
            # NOTE: this mutates the caller's document metadata in place.
            document.meta_data["text"] = document.content
            vectors.append(
                Vector(
                    id=document.id,
                    values=document.embedding,
                    metadata=document.meta_data,
                )
            )

        self.index.upsert(
            vectors=vectors,
            namespace=self._resolve_namespace(namespace),
            batch_size=batch_size,
            show_progress=show_progress,
        )

    def upsert_available(self) -> bool:
        """Check if upsert operation is available.

        Returns:
            bool: True if upsert is available, False otherwise.
        """
        return True

    def insert(self, documents: List[Document]) -> None:
        """Insert documents into the index.

        This method is not supported by Pinecone. Use `upsert` instead.

        Args:
            documents (List[Document]): The documents to insert.

        Raises:
            NotImplementedError: This method is not supported by Pinecone.
        """
        raise NotImplementedError("Pinecone does not support insert operations. Use upsert instead.")

    def search(
        self,
        query: str,
        limit: int = 5,
        namespace: Optional[str] = None,
        filter: Optional[Dict[str, Union[str, float, int, bool, List, dict]]] = None,
        include_values: Optional[bool] = None,
    ) -> List[Document]:
        """Search for similar documents in the index.

        Metadata is always requested, because the document content is read back
        from the "text" metadata key.

        Args:
            query (str): The query to search for.
            limit (int, optional): The maximum number of results to return. Defaults to 5.
            namespace (Optional[str], optional): The namespace to search in.
                Defaults to None, in which case the instance namespace is used.
            filter (Optional[Dict[str, Union[str, float, int, bool, List, dict]]], optional): The filter
                for the search. Defaults to None.
            include_values (Optional[bool], optional): Whether to include values in the search results.
                Defaults to None.

        Returns:
            List[Document]: The list of matching documents. Empty if the query could not be embedded.
        """
        query_embedding = self.embedder.get_embedding(query)
        if query_embedding is None:
            logger.error(f"Error getting embedding for Query: {query}")
            return []

        response = self.index.query(
            vector=query_embedding,
            top_k=limit,
            namespace=self._resolve_namespace(namespace),
            filter=filter,
            include_values=include_values,
            include_metadata=True,
        )

        search_results: List[Document] = []
        for result in response.matches:
            # Normalize missing metadata to an empty dict so content and meta_data
            # are derived consistently (upsert treats meta_data as a dict).
            metadata = result.metadata if result.metadata is not None else {}
            search_results.append(
                Document(
                    content=metadata.get("text", ""),
                    id=result.id,
                    embedding=result.values,
                    meta_data=metadata,
                )
            )
        return search_results

    def optimize(self) -> None:
        """Optimize the index.

        This method can be left empty as Pinecone automatically optimizes indexes.
        """
        pass

    def clear(self, namespace: Optional[str] = None) -> bool:
        """Clear the index.

        Args:
            namespace (Optional[str], optional): The namespace to clear.
                Defaults to None, in which case the instance namespace is used.

        Returns:
            bool: True if the delete request succeeded, False if it raised an error.
        """
        try:
            self.index.delete(delete_all=True, namespace=self._resolve_namespace(namespace))
            return True
        except Exception as e:
            # Best-effort operation: report failure through the return value, but leave a trace.
            logger.warning(f"Error clearing Pinecone index '{self.name}': {e}")
            return False