File size: 1,195 Bytes
5e20c77
 
10330bc
 
 
 
 
 
 
5e20c77
 
10330bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e20c77
 
10330bc
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from chainlit.types import AskFileResponse
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
embeddings = OpenAIEmbeddings()

def process_file(file: AskFileResponse):
    import tempfile 

    if file.type == "text/plain":
        Loader = TextLoader
    elif file.type == "application/pdf":
        Loader = PyPDFDirectoryLoader

    with tempfile.NamedTemporaryFile() as tempfile:
        tempfile.write(file.content)
        loader = Loader(tempfile.name)
        documents = loader.load()
        # text_splitter = text_splitter()
        docs = text_splitter.split_documents(documents)

        for i, doc in enumerate(docs):
            doc.metadata["source"] = f"source_{i}"
        return docs

def get_docSearch(file: AskFileResponse):
    docs = process_file(file)

    ## save data in user session 
    
    docsearch = Chroma.from_documents(docs, embeddings)

    return docsearch