"""Embed preprocessed text chunks with OpenAI and store them in a Deep Lake vector store."""
import openai
import pandas as pd
from deeplake.core.vectorstore import VectorStore

from utils import zip_contents


def embedding_function(texts, model="text-embedding-ada-002"):
    """Embed a string or a list of strings with OpenAI's embeddings endpoint.

    Note: this uses the legacy `openai.Embedding` interface (openai<1.0).
    """
    if isinstance(texts, str):
        texts = [texts]

    # OpenAI recommends replacing newlines with spaces before embedding.
    texts = [t.replace("\n", " ") for t in texts]
    return [
        data["embedding"]
        for data in openai.Embedding.create(input=texts, model=model)["data"]
    ]
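
# Quick sanity check (illustrative, not run by this script):
# embedding_function("hello world") returns a list containing one
# 1536-dimensional vector for text-embedding-ada-002.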


def extract_metadata(df: pd.DataFrame) -> list:
    """Extract per-row metadata from the dataframe in the Deep Lake dict format."""
    metadata = df.apply(
        lambda x: {
            "url": x.url,
            "source": x.source,
            "title": x.title,
        },
        axis=1,
    ).to_list()
    return metadata
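
# Shape note: `metadata` is a list of dicts aligned with the dataframe rows,
# e.g. [{"url": ..., "source": ..., "title": ...}, ...], which matches the
# `metadata` argument of VectorStore.add below.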


if __name__ == "__main__":
    vector_store_path = "deeplake_store"
    chunk_file = "data/chunks_preprocessed.csv"
    overwrite = True
    df = pd.read_csv(chunk_file)

    for col in ["url", "source", "title", "content"]:
        assert col in df.columns, f"missing required column: {col}"

    # extract the text + metadata
    metadata = extract_metadata(df)
    chunked_text = df.content.to_list()

    # init the vector store
    vector_store = VectorStore(
        path=vector_store_path,
        overwrite=overwrite,
    )

    # add the embeddings
    vector_store.add(
        text=chunked_text,
        embedding_function=embedding_function,
        embedding_data=chunked_text,
        metadata=metadata,
    )

    # save the deeplake folder to a zip file
    zipped_file_path = zip_contents(input_path=vector_store_path, output_path=".")
    print(f"Contents zipped to: {zipped_file_path}")