"""Build a Deep Lake vector store from preprocessed text chunks and zip it."""
import openai
import pandas as pd
from deeplake.core.vectorstore import VectorStore

from utils import zip_contents
def embedding_function(texts, model="text-embedding-ada-002"):
    """Embed one or more texts with the OpenAI embeddings API.

    Parameters
    ----------
    texts : str | list[str]
        A single string or a list of strings to embed.
    model : str
        Name of the OpenAI embedding model to use.

    Returns
    -------
    list[list[float]]
        One embedding vector per input text, in input order.
    """
    if isinstance(texts, str):
        texts = [texts]
    # Newlines can degrade embedding quality; flatten each text to one line.
    texts = [t.replace("\n", " ") for t in texts]
    response = openai.Embedding.create(input=texts, model=model)
    return [item["embedding"] for item in response["data"]]
def extract_metadata(df: pd.DataFrame) -> list:
    """Extract per-row metadata from *df* in the deeplake dict format.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain "url", "source" and "title" columns.

    Returns
    -------
    list
        One ``{"url", "source", "title"}`` dict per row, in row order.
        (Empty list for an empty DataFrame.)
    """
    # to_dict("records") builds the list-of-dicts directly and, unlike
    # df.apply(..., axis=1).to_list(), also behaves correctly on an
    # empty DataFrame.
    return df[["url", "source", "title"]].to_dict(orient="records")
if __name__ == "__main__":
    vector_store_path = "deeplake_store"
    chunk_file = "data/chunks_preprocessed.csv"
    overwrite = True

    df = pd.read_csv(chunk_file)

    # Fail fast if the CSV is missing any column used below. A plain
    # `assert` would be stripped under `python -O`, so raise explicitly.
    required_columns = ["url", "source", "title", "content"]
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise ValueError(f"{chunk_file} is missing required columns: {missing}")

    # extract the text + metadata
    metadata = extract_metadata(df)
    chunked_text = df.content.to_list()

    # init the vector store — honor the `overwrite` flag declared above
    # instead of hard-coding True.
    vector_store = VectorStore(
        path=vector_store_path,
        overwrite=overwrite,
    )

    # add the embeddings (deeplake calls embedding_function on embedding_data)
    vector_store.add(
        text=chunked_text,
        embedding_function=embedding_function,
        embedding_data=chunked_text,
        metadata=metadata,
    )

    # save the deeplake folder to a zip file
    zipped_file_path = zip_contents(input_path=vector_store_path, output_path=".")
    print(f"Contents zipped to: {zipped_file_path}")