from pinecone import Pinecone
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
import os
import uuid

# Locally stored sentence-transformers model used for embeddings
# (defaults to all-MiniLM-L6-v2).
model_name = os.environ.get('MODEL_NAME', 'all-MiniLM-L6-v2')
def create_vector_store_index(file_path):
    """Load a document, chunk it, embed the chunks, and upsert them into Pinecone."""
    file_path_split = file_path.split(".")
    file_type = file_path_split[-1].rstrip('/')

    # Pick a loader based on the file extension.
    if file_type == 'docx':
        loader = Docx2txtLoader(file_path)
    elif file_type == 'pdf':
        loader = PyPDFLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_type}")
    pages = loader.load()

    # Split the loaded pages into overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=128)
    docs = text_splitter.split_documents(pages)

    # Connect to the Pinecone index configured in the environment.
    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY"),
    )
    index = pc.Index(os.environ.get("PINECONE_INDEX"))

    # Embed with the locally stored sentence-transformers model.
    embeddings = HuggingFaceEmbeddings(model_name=f"./model/{model_name}")

    # Embed and upsert the chunks in batches.
    batch_size = 32
    for i in range(0, len(docs), batch_size):
        i_end = min(len(docs), i + batch_size)
        batch = docs[i:i_end]
        ids = [str(uuid.uuid4()) for _ in batch]
        texts = [x.page_content for x in batch]
        embeds = embeddings.embed_documents(texts)
        metadata = [
            {'text': x.page_content, **x.metadata} for x in batch
        ]
        index.upsert(vectors=list(zip(ids, embeds, metadata)))

    return "Vector store index is created."

def upload_and_create_vector_store(files):
    """Index every uploaded file and return the status message from the last one."""
    index_success_msg = "No files to index."
    for file in files:
        index_success_msg = create_vector_store_index(file)
    return index_success_msg
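
# Minimal usage sketch: assumes PINECONE_API_KEY and PINECONE_INDEX are already set
# in the environment and that the example paths below (hypothetical placeholders)
# point to real files on disk.
if __name__ == "__main__":
    sample_files = ["./docs/example.docx", "./docs/example.pdf"]  # hypothetical paths
    print(upload_and_create_vector_store(sample_files))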