# buster/embed_documents.py
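"""Embed preprocessed document chunks and build a Deep Lake vector store.

Reads chunked documents from a CSV file, embeds them with OpenAI's
text-embedding-ada-002 model, stores text + embeddings + metadata in a local
Deep Lake VectorStore, and zips the resulting folder.
"""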
import openai
import pandas as pd
from deeplake.core.vectorstore import VectorStore

from utils import zip_contents


def embedding_function(texts, model="text-embedding-ada-002"):
    """Embed one or more texts with OpenAI's embeddings API."""
    if isinstance(texts, str):
        texts = [texts]

    # replacing newlines with spaces is a common preprocessing step for OpenAI embeddings
    texts = [t.replace("\n", " ") for t in texts]
    return [
        data["embedding"]
        for data in openai.Embedding.create(input=texts, model=model)["data"]
    ]


def extract_metadata(df: pd.DataFrame) -> list[dict]:
    """Extract the metadata from the dataframe in Deep Lake's list-of-dicts format."""
    metadata = df.apply(
        lambda x: {
            "url": x.url,
            "source": x.source,
            "title": x.title,
        },
        axis=1,
    ).to_list()
    return metadata


if __name__ == "__main__":
    vector_store_path = "deeplake_store"
    chunk_file = "data/chunks_preprocessed.csv"
    overwrite = True

    df = pd.read_csv(chunk_file)

    # the preprocessed chunks must expose these columns
    for col in ["url", "source", "title", "content"]:
        assert col in df.columns, f"Missing column: {col}"

    # extract the text + metadata
    metadata = extract_metadata(df)
    chunked_text = df.content.to_list()

    # init the vector store
    vector_store = VectorStore(
        path=vector_store_path,
        overwrite=overwrite,
    )

    # add the embeddings
    vector_store.add(
        text=chunked_text,
        embedding_function=embedding_function,
        embedding_data=chunked_text,
        metadata=metadata,
    )

    # save the deeplake folder to a zip file
    zipped_file_path = zip_contents(input_path=vector_store_path, output_path=".")
    print(f"Contents zipped to: {zipped_file_path}")