Spaces:
Runtime error
Runtime error
File size: 7,676 Bytes
58d33f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 |
"""Wrapper around Activeloop Deep Lake."""
from __future__ import annotations
import logging
import uuid
from typing import Any, Iterable, List, Optional, Sequence
import numpy as np
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VectorStore
logger = logging.getLogger()
def L2_search(
query_embedding: np.ndarray, data_vectors: np.ndarray, k: int = 4
) -> list:
"""naive L2 search for nearest neighbors"""
# Calculate the L2 distance between the query_vector and all data_vectors
distances = np.linalg.norm(data_vectors - query_embedding, axis=1)
# Sort the distances and return the indices of the k nearest vectors
nearest_indices = np.argsort(distances)[:k]
return nearest_indices.tolist()
class DeepLake(VectorStore):
"""Wrapper around Deep Lake, a data lake for deep learning applications.
It not only stores embeddings, but also the original data and queries with
version control automatically enabled.
It is more than just a vector store. You can use the dataset to fine-tune
your own LLM models or use it for other downstream tasks.
We implement naive similiarity search, but it can be extended with Tensor
Query Language (TQL for production use cases) over billion rows.
To use, you should have the ``deeplake`` python package installed.
Example:
.. code-block:: python
from langchain.vectorstores import DeepLake
from langchain.embeddings.openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
vectorstore = DeepLake("langchain_store", embeddings.embed_query)
"""
_LANGCHAIN_DEFAULT_DEEPLAKE_PATH = "mem://langchain"
def __init__(
self,
dataset_path: str = _LANGCHAIN_DEFAULT_DEEPLAKE_PATH,
token: Optional[str] = None,
embedding_function: Optional[Embeddings] = None,
) -> None:
"""Initialize with Deep Lake client."""
try:
import deeplake
except ImportError:
raise ValueError(
"Could not import deeplake python package. "
"Please install it with `pip install deeplake`."
)
self._deeplake = deeplake
if deeplake.exists(dataset_path, token=token):
self.ds = deeplake.load(dataset_path, token=token)
logger.warning(
f"Deep Lake Dataset in {dataset_path} already exists, "
f"loading from the storage"
)
self.ds.summary()
else:
self.ds = deeplake.empty(dataset_path, token=token, overwrite=True)
with self.ds:
self.ds.create_tensor("text", htype="text")
self.ds.create_tensor("metadata", htype="json")
self.ds.create_tensor("embedding", htype="generic")
self.ds.create_tensor("ids", htype="text")
self._embedding_function = embedding_function
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Run more texts through the embeddings and add to the vectorstore.
Args:
texts (Iterable[str]): Texts to add to the vectorstore.
metadatas (Optional[List[dict]], optional): Optional list of metadatas.
ids (Optional[List[str]], optional): Optional list of IDs.
Returns:
List[str]: List of IDs of the added texts.
"""
if ids is None:
ids = [str(uuid.uuid1()) for _ in texts]
text_list = list(texts)
if self._embedding_function is None:
embeddings: Sequence[Optional[List[float]]] = [None] * len(text_list)
else:
embeddings = self._embedding_function.embed_documents(text_list)
if metadatas is None:
metadatas_to_use: Sequence[Optional[dict]] = [None] * len(text_list)
else:
metadatas_to_use = metadatas
elements = zip(text_list, embeddings, metadatas_to_use, ids)
@self._deeplake.compute
def ingest(sample_in: list, sample_out: list) -> None:
s = {
"text": sample_in[0],
"embedding": sample_in[1],
"metadata": sample_in[2],
"ids": sample_in[3],
}
sample_out.append(s)
ingest().eval(list(elements), self.ds)
self.ds.commit()
return ids
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
"""Return docs most similar to query."""
if self._embedding_function is None:
self.ds.summary()
ds_view = self.ds.filter(lambda x: query in x["text"].data()["value"])
else:
query_emb = np.array(self._embedding_function.embed_query(query))
embeddings = self.ds.embedding.numpy()
indices = L2_search(query_emb, embeddings, k=k)
ds_view = self.ds[indices]
docs = [
Document(
page_content=el["text"].data()["value"],
metadata=el["metadata"].data()["value"],
)
for el in ds_view
]
return docs
@classmethod
def from_texts(
cls,
texts: List[str],
embedding: Optional[Embeddings] = None,
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
dataset_path: str = _LANGCHAIN_DEFAULT_DEEPLAKE_PATH,
**kwargs: Any,
) -> DeepLake:
"""Create a Deep Lake dataset from a raw documents.
If a persist_directory is specified, the collection will be persisted there.
Otherwise, the data will be ephemeral in-memory.
Args:
path (str, pathlib.Path): - The full path to the dataset. Can be:
- a Deep Lake cloud path of the form ``hub://username/datasetname``.
To write to Deep Lake cloud datasets,
ensure that you are logged in to Deep Lake
(use 'activeloop login' from command line)
- an s3 path of the form ``s3://bucketname/path/to/dataset``.
Credentials are required in either the environment or
passed to the creds argument.
- a local file system path of the form ``./path/to/dataset`` or
``~/path/to/dataset`` or ``path/to/dataset``.
- a memory path of the form ``mem://path/to/dataset`` which doesn't
save the dataset but keeps it in memory instead.
Should be used only for testing as it does not persist.
documents (List[Document]): List of documents to add.
embedding (Optional[Embeddings]): Embedding function. Defaults to None.
metadatas (Optional[List[dict]]): List of metadatas. Defaults to None.
ids (Optional[List[str]]): List of document IDs. Defaults to None.
Returns:
DeepLake: Deep Lake dataset.
"""
deeplake_dataset = cls(
dataset_path=dataset_path,
embedding_function=embedding,
)
deeplake_dataset.add_texts(texts=texts, metadatas=metadatas, ids=ids)
return deeplake_dataset
def delete_dataset(self) -> None:
"""Delete the collection."""
self.ds.delete()
def persist(self) -> None:
"""Persist the collection."""
self.ds.flush()
|