# buster/embed_documents.py
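"""Embed preprocessed document chunks and build a Deep Lake vector store.

Reads chunked documents from a CSV file, embeds them with OpenAI's
text-embedding-ada-002 model, stores text + embeddings + metadata in a local
Deep Lake VectorStore, and zips the resulting folder.
"""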
import openai
import pandas as pd
from deeplake.core.vectorstore import VectorStore

from utils import zip_contents


def embedding_function(texts, model="text-embedding-ada-002"):
    """Embed one or more texts with OpenAI's embeddings API."""
    if isinstance(texts, str):
        texts = [texts]

    # replacing newlines with spaces is a common preprocessing step for OpenAI embeddings
    texts = [t.replace("\n", " ") for t in texts]
    return [
        data["embedding"]
        for data in openai.Embedding.create(input=texts, model=model)["data"]
    ]


def extract_metadata(df: pd.DataFrame) -> list[dict]:
    """Extract the metadata from the dataframe in Deep Lake's list-of-dicts format."""
    metadata = df.apply(
        lambda x: {
            "url": x.url,
            "source": x.source,
            "title": x.title,
        },
        axis=1,
    ).to_list()
    return metadata


if __name__ == "__main__":
    vector_store_path = "deeplake_store"
    chunk_file = "data/chunks_preprocessed.csv"
    overwrite = True

    df = pd.read_csv(chunk_file)

    # the preprocessed chunks must expose these columns
    for col in ["url", "source", "title", "content"]:
        assert col in df.columns, f"Missing column: {col}"

    # extract the text + metadata
    metadata = extract_metadata(df)
    chunked_text = df.content.to_list()

    # init the vector store
    vector_store = VectorStore(
        path=vector_store_path,
        overwrite=overwrite,
    )

    # add the embeddings
    vector_store.add(
        text=chunked_text,
        embedding_function=embedding_function,
        embedding_data=chunked_text,
        metadata=metadata,
    )

    # save the deeplake folder to a zip file
    zipped_file_path = zip_contents(input_path=vector_store_path, output_path=".")
    print(f"Contents zipped to: {zipped_file_path}")