|
from pinecone import Pinecone |
|
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
import os, uuid |
|
|
|
# Name of the local sentence-transformers model directory under ./model/;
# override with the MODEL_NAME environment variable.
model_name = os.environ.get('MODEL_NAME', 'all-MiniLM-L6-v2')
|
|
|
def create_vector_store_index(file_path):
    """Load a document, split it into chunks, embed them, and upsert into Pinecone.

    Args:
        file_path: Path to a .docx or .pdf file to index.

    Returns:
        A success message string once all chunks have been upserted.

    Raises:
        ValueError: If the file extension is not 'docx' or 'pdf'.

    Requires PINECONE_API_KEY and PINECONE_INDEX environment variables, and a
    local embedding model under ./model/{model_name}.
    """
    # Use splitext instead of split('.') so paths with dots elsewhere
    # (e.g. './data.v2/report.pdf') are handled; lower() makes it
    # case-insensitive ('.PDF' works too).
    file_type = os.path.splitext(file_path)[1].lstrip('.').lower()

    # BUG FIX: the original dispatched 'csv' to Docx2txtLoader, which parses
    # Word .docx files — 'docx' is the correct extension for that loader.
    if file_type == 'docx':
        loader = Docx2txtLoader(file_path)
    elif file_type == 'pdf':
        loader = PyPDFLoader(file_path)
    else:
        # Fail fast with a clear message instead of a NameError on 'loader'.
        raise ValueError(f"Unsupported file type: {file_type!r}; expected 'docx' or 'pdf'.")

    pages = loader.load()

    # Overlapping chunks so context straddling a boundary is not lost.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=128)

    docs = text_splitter.split_documents(pages)

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY"),
    )

    index = pc.Index(os.environ.get("PINECONE_INDEX"))

    # Embeddings are loaded from a local model directory, not downloaded.
    embeddings = HuggingFaceEmbeddings(model_name=f"./model/{model_name}")

    # Upsert in batches to keep each Pinecone request a reasonable size.
    batch_size = 32

    for i in range(0, len(docs), batch_size):
        i_end = min(len(docs), i + batch_size)
        batch = docs[i:i_end]
        ids = [str(uuid.uuid4()) for _ in batch]
        texts = [x.page_content for x in batch]
        embeds = embeddings.embed_documents(texts)
        # Store the chunk text alongside any loader-provided metadata
        # (e.g. source path, page number) so queries can return context.
        metadata = [
            {'text': x.page_content, **x.metadata} for x in batch
        ]
        index.upsert(vectors=zip(ids, embeds, metadata))

    return "Vector store index is created."
|
|
|
|
|
def upload_and_create_vector_store(files):
    """Index each uploaded file and return the last indexing status message.

    Args:
        files: Iterable of file paths to index.

    Returns:
        The success message from the final file indexed, or a notice when
        no files were provided.
    """
    # BUG FIX: the original raised NameError on an empty 'files' because
    # 'index_success_msg' was only assigned inside the loop.
    index_success_msg = "No files were provided."

    for file in files:
        index_success_msg = create_vector_store_index(file)

    return index_success_msg
|
|