import os

import pandas as pd

from buster.documents_manager import DeepLakeDocumentsManager

DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset-2")
DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")

# Load the pre-chunked source datasets. Every CSV must provide the columns
# required by the documents manager: url, content, source, title.
df1 = pd.read_csv("data/advanced_rag_course.csv")
df2 = pd.read_csv("data/hf_transformers.csv")
df3 = pd.read_csv("data/langchain_course.csv")
df4 = pd.read_csv("data/filtered_tai_v2.csv")
df5 = pd.read_csv("data/wiki.csv")  # pass encoding="ISO-8859-1" if decoding fails
# df6 = pd.read_csv("data/openai.csv")  # Broken CSV -- do not load or add
df7 = pd.read_csv("data/activeloop.csv")
df8 = pd.read_csv("data/llm_course.csv")
df9 = pd.read_csv("data/langchain_docs.csv")  # pass encoding="ISO-8859-1" if decoding fails

dataframes = [df1, df2, df3, df4, df5, df7, df8, df9]

# Sanity check: row count and column names for every loaded dataset.
for df in dataframes:
    print(len(df), df.columns)

# Write to a local Deep Lake dataset by default; uncomment the hub:// path
# to upload to the Activeloop organization instead.
# dataset_path = f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"
dataset_path = "local_dataset"
# dataset_path = DEEPLAKE_DATASET

dm = DeepLakeDocumentsManager(
    vector_store_path=dataset_path,
    overwrite=False,
    required_columns=["url", "content", "source", "title"],
)

# Embed and upload every dataset (df6 is excluded because its CSV is broken).
for df in dataframes:
    dm.batch_add(
        df=df,
        batch_size=3000,
        min_time_interval=5,
        num_workers=15,
        csv_overwrite=False,
    )

# An earlier run used heavier settings and dumped embeddings and errors to
# CSV files. Kept for reference; the same call was made for each of df2
# through df7:
# dm.batch_add(
#     df=df2,
#     batch_size=3000,
#     min_time_interval=60,
#     num_workers=32,
#     csv_embeddings_filename="embeddings.csv",
#     csv_errors_filename="tmp.csv",
#     csv_overwrite=False,
# )

# Repair pass for rows whose embedding failed (shape(embedding)[0] == 0).
# Requires: import numpy as np, from openai import OpenAI, and deeplake's
# VectorStore (from deeplake.core.vectorstore import VectorStore).
# client = OpenAI()
# openai_embeddings = OpenAIEmbeddings()  # unused; left from an earlier attempt

# def get_embedding(text, model="text-embedding-ada-002"):
#     # Call OpenAI's API to create the embedding
#     response = client.embeddings.create(input=[text], model=model)
#     # Extract the embedding data from the response
#     embedding = response.data[0].embedding
#     # Convert an ndarray to a plain list
#     if isinstance(embedding, np.ndarray):
#         embedding = embedding.tolist()
#     return embedding

# vs = VectorStore(
#     dataset_path,
#     runtime="compute_engine",
#     token=os.environ["ACTIVELOOP_TOKEN"],
# )
# data = vs.search(query="select * where shape(embedding)[0] == 0")
# vs.update_embedding(
#     embedding_source_tensor="text",
#     query="select * where shape(embedding)[0] == 0",
#     exec_option="compute_engine",
#     embedding_function=get_embedding,
# )
# data2 = vs.search(query="select * where shape(embedding)[0] == 0")
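
# Optional post-upload check -- a minimal sketch, assuming the local dataset
# is a standard Deep Lake v3 dataset (deeplake.load / Dataset.summary), which
# is what Buster's DeepLakeDocumentsManager writes, and assuming one stored
# document per CSV row. `expected` counts only this run's rows, so it can
# differ from the store total if the dataset already held data.
import deeplake

ds = deeplake.load(dataset_path)
ds.summary()  # prints tensor names, shapes, and dtypes
expected = sum(len(df) for df in dataframes)
print(f"{len(ds)} rows in the store; {expected} rows uploaded in this run")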