# frenchlaw/src/vector_index.py
# Repository page header (not part of the module): commit 77bd9cf,
# "change model directory", 1.64 kB.
from pinecone import Pinecone
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
import os, uuid
# Name of the embedding model directory under ./model/; can be overridden
# through the MODEL_NAME environment variable.
model_name = os.getenv('MODEL_NAME', 'all-MiniLM-L6-v2')
def create_vector_store_index(file_path):
    """Load a document, split it into chunks, embed them and upsert to Pinecone.

    The loader is chosen from the file extension (case-insensitive):
    ``docx`` uses ``Docx2txtLoader`` and ``pdf`` uses ``PyPDFLoader``.
    The loaded pages are split with ``RecursiveCharacterTextSplitter``
    (``chunk_size=512``, ``chunk_overlap=128``), embedded with the
    HuggingFace model stored at ``./model/<model_name>`` and upserted in
    batches of 32 into the Pinecone index named by the ``PINECONE_INDEX``
    environment variable, authenticating with ``PINECONE_API_KEY``.

    Args:
        file_path: Path of the document to index, as a string.

    Returns:
        The status message ``"Vector store index is created."``.

    Raises:
        ValueError: If the file extension is neither ``docx`` nor ``pdf``.
    """
    # Resolve the loader first so that an unsupported file fails fast, before
    # any model loading or network access takes place.
    extension = os.path.splitext(file_path.rstrip("/"))[1].lstrip(".").lower()
    if extension == "docx":
        loader = Docx2txtLoader(file_path)
    elif extension == "pdf":
        loader = PyPDFLoader(file_path)
    else:
        raise ValueError(
            f"Unsupported file type {extension!r} for {file_path!r}; "
            "expected a .docx or .pdf file."
        )

    pages = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=128,
    )
    docs = text_splitter.split_documents(pages)

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY"),
    )
    index = pc.Index(os.environ.get("PINECONE_INDEX"))
    embeddings = HuggingFaceEmbeddings(model_name=f"./model/{model_name}")

    batch_size = 32
    for start in range(0, len(docs), batch_size):
        # Slicing clamps at the end of the list, so the last batch may be short.
        batch = docs[start:start + batch_size]
        ids = [str(uuid.uuid4()) for _ in batch]
        texts = [doc.page_content for doc in batch]
        embeds = embeddings.embed_documents(texts)
        # Keep the raw chunk text next to the loader-provided metadata; a
        # 'text' key already present in the metadata takes precedence.
        metadata = [
            {'text': doc.page_content, **doc.metadata} for doc in batch
        ]
        # Materialise the tuples: upsert expects a list, not a one-shot iterator.
        index.upsert(vectors=list(zip(ids, embeds, metadata)))

    return "Vector store index is created."
def upload_and_create_vector_store(files):
    """Index every file in ``files`` and return a status message.

    Each entry is passed to ``create_vector_store_index`` in order.

    Args:
        files: Iterable of file paths to index. May be empty or ``None``.

    Returns:
        The message returned for the last indexed file, or a notice that
        nothing was indexed when ``files`` is empty or ``None``.
    """
    # Bind a default up front so that an empty or missing selection yields a
    # message instead of an UnboundLocalError / TypeError.
    index_success_msg = "No files were provided, so nothing was indexed."
    for file in files or ():
        index_success_msg = create_vector_store_index(file)
    return index_success_msg