Backend / app /data_indexing.py
bachephysicdun's picture
comment out loading local environmental params
c6db65d
import os
import uuid
from pathlib import Path
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
# from dotenv import load_dotenv
# # Specify the path to the .env file two directories up
# env_path = Path(__file__).resolve().parents[2] / '.env'
# load_dotenv(dotenv_path=env_path)
current_dir = Path(__file__).resolve().parent
class DataIndexer:
source_file = os.path.join(current_dir, 'sources.txt')
def __init__(self, index_name='langchain-repo'):
# TODO: choose your embedding model
# self.embedding_client = InferenceClient(
# "dunzhang/stella_en_1.5B_v5",
# token=os.environ['HF_TOKEN'],
# )
self.embedding_client = OpenAIEmbeddings()
self.index_name = index_name
self.pinecone_client = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))
if index_name not in self.pinecone_client.list_indexes().names():
# TODO: create your index if it doesn't exist. Use the create_index function.
# Make sure to choose the dimension that corresponds to your embedding model
self.pinecone_client.create_index(
name=index_name,
dimension=1536,
metric='cosine',
spec=ServerlessSpec(cloud='aws', region='us-east-1')
)
self.index = self.pinecone_client.Index(self.index_name)
# TODO: make sure to build the index.
self.source_index = self.get_source_index()
def get_source_index(self):
if not os.path.isfile(self.source_file):
print('No source file')
return None
print('create source index')
with open(self.source_file, 'r') as file:
sources = file.readlines()
sources = [s.rstrip('\n') for s in sources]
vectorstore = Chroma.from_texts(
sources, embedding=self.embedding_client
)
return vectorstore
def index_data(self, docs, batch_size=32):
with open(self.source_file, 'a') as file:
for doc in docs:
file.writelines(doc.metadata['source'] + '\n')
for i in range(0, len(docs), batch_size):
batch = docs[i: i + batch_size]
# create a list of the vector representations of each text data in the batch
# based on the selected model, choose you extract values
# values = self.embedding_client.embed_documents([
# doc.page_content for doc in batch
# ])
# values = self.embedding_client.feature_extraction([
# doc.page_content for doc in batch
# ])
values = self.embedding_client.embed_documents([
doc.page_content for doc in batch
]) # list of vectors -> vector presentation of the doc
# create a list of unique identifiers for each element in the batch with the uuid package.
vector_ids = [str(uuid.uuid4()) for _ in batch]
# create a list of dictionaries representing the metadata. Capture the text data
# with the "text" key, and make sure to capture the rest of the doc.metadata.
metadatas = [{
'text': doc.page_content, **doc.metadata
} for doc in batch]
# create a list of dictionaries with keys "id" (the unique identifiers), "values"
# (the vector representation), and "metadata" (the metadata).
vectors = [{
'id': vector_id,
'values': value,
'metadata': metadata
} for vector_id, value, metadata in zip(vector_ids, values, metadatas)]
try:
# TODO: Use the function upsert to upload the data to the database.
upsert_response = self.index.upsert(vectors=vectors)
print(upsert_response)
except Exception as e:
print(e)
def search(self, text_query, top_k=5, hybrid_search=False):
filter = None
if hybrid_search and self.source_index:
# I implemented the filtering process to pull the 50 most relevant file names
# to the question. Make sure to adjust this number as you see fit.
source_docs = self.source_index.similarity_search(text_query, 50)
filter = {"source": {"$in":[doc.page_content for doc in source_docs]}}
# TODO: embed the text_query by using the embedding model
# TODO: choose your embedding model
# vector = self.embedding_client.feature_extraction(text_query)
# vector = self.embedding_client.embed_query(text_query)
vector = self.embedding_client.embed_query(text_query)
# TODO: use the vector representation of the text_query to
# search the database by using the query function.
result = self.index.query(
# namespace=self.index_name,
vector=vector,
filter=filter,
top_k=top_k,
include_metadata=True,
)
docs = []
for res in result["matches"]:
# TODO: From the result's metadata, extract the "text" element.
metadata = res['metadata']
if 'text' in metadata:
text = metadata.pop('text')
docs.append(text)
return docs
if __name__ == '__main__':
from langchain_community.document_loaders import GitLoader
from langchain_text_splitters import (
Language,
RecursiveCharacterTextSplitter,
)
print('start the GitLoader')
loader = GitLoader(
clone_url="https://github.com/langchain-ai/langchain",
repo_path="./code_data/langchain_repo/",
branch="master",
)
print('perfrom python splitter')
python_splitter = RecursiveCharacterTextSplitter.from_language(
language=Language.PYTHON, chunk_size=10000, chunk_overlap=100
)
docs = loader.load()
docs = [doc for doc in docs if doc.metadata['file_type'] in ['.py', '.md']]
docs = [doc for doc in docs if len(doc.page_content) < 50000]
docs = python_splitter.split_documents(docs)
for doc in docs:
doc.page_content = '# {}\n\n'.format(doc.metadata['source']) + doc.page_content
print('instantiat the data indexer')
indexer = DataIndexer()
# with open('/app/sources.txt', 'a') as file:
with open(indexer.source_file, 'a') as file:
for doc in docs:
file.writelines(doc.metadata['source'] + '\n')
indexer.index_data(docs)