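"""Build a Pinecone vector index from uploaded documents.

Documents (.docx or .pdf) are loaded, split into overlapping chunks, embedded
with a local HuggingFace sentence-transformers model, and upserted into the
Pinecone index named by the PINECONE_INDEX environment variable.
"""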
from pinecone import Pinecone
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
import os, uuid

# Embedding model to load from the local ./model directory; defaults to all-MiniLM-L6-v2.
model_name = os.environ.get('MODEL_NAME', 'all-MiniLM-L6-v2')

def create_vector_store_index(file_path):
    """Load a document, chunk it, embed the chunks, and upsert them into Pinecone."""

    # Pick a loader based on the file extension.
    file_type = file_path.split(".")[-1].rstrip('/').lower()

    if file_type == 'docx':
        loader = Docx2txtLoader(file_path)
    elif file_type == 'pdf':
        loader = PyPDFLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_type}")

    pages = loader.load()

    # Split the pages into overlapping chunks suitable for embedding.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=128)

    docs = text_splitter.split_documents(pages)

    # Connect to the Pinecone index named by the PINECONE_INDEX environment variable.
    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY"),
    )

    index = pc.Index(os.environ.get("PINECONE_INDEX"))

    # Load the embedding model from the local ./model directory.
    embeddings = HuggingFaceEmbeddings(model_name=f"./model/{model_name}")

    # Embed and upsert the chunks in batches to keep requests small.
    batch_size = 32

    for i in range(0, len(docs), batch_size):
        i_end = min(len(docs), i + batch_size)
        batch = docs[i:i_end]
        ids = [str(uuid.uuid4()) for _ in batch]
        texts = [x.page_content for x in batch]
        embeds = embeddings.embed_documents(texts)
        # Keep the chunk text alongside any loader metadata (e.g. source, page).
        metadata = [
            {'text': x.page_content, **x.metadata} for x in batch
        ]
        index.upsert(vectors=zip(ids, embeds, metadata))

    return "Vector store index is created."


def upload_and_create_vector_store(files):
    """Index each uploaded file and return the status of the last one."""

    index_success_msg = "No files were indexed."
    for file in files:
        index_success_msg = create_vector_store_index(file)

    return index_success_msg
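

# Example usage (a minimal sketch; assumes PINECONE_API_KEY and PINECONE_INDEX
# are set in the environment, the local ./model directory holds the embedding
# model, and the listed files exist -- the paths below are hypothetical):
if __name__ == "__main__":
    sample_files = ["./docs/handbook.docx", "./docs/report.pdf"]
    print(upload_and_create_vector_store(sample_files))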