Document-Reader / src /utils.py
singhjagpreet's picture
loading file into chat
10330bc
raw
history blame
1.2 kB
from chainlit.types import AskFileResponse
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
# Shared module-level helpers, built once at import time and reused by the
# functions below.

# Embedding model handed to the vector store; constructed with its defaults.
embeddings = OpenAIEmbeddings()

# Splitter used to cut loaded documents into overlapping chunks
# (chunk_size=1000 with chunk_overlap=100 between consecutive chunks).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
def process_file(file: AskFileResponse):
    """Load an uploaded file and split it into chunked documents.

    The upload's bytes are written to a private temporary directory, loaded
    with the loader matching ``file.type``, and split with the module-level
    ``text_splitter``. Each resulting chunk gets a synthetic
    ``metadata["source"]`` of the form ``source_<index>``.

    Args:
        file: The uploaded file; its ``type`` (MIME type) and ``content``
            (raw bytes) attributes are read.

    Returns:
        The list of chunked documents produced by the splitter.

    Raises:
        ValueError: If ``file.type`` is neither ``"text/plain"`` nor
            ``"application/pdf"``.
    """
    # Function-scope imports keep the stdlib dependencies local to this helper.
    import os
    import tempfile

    if file.type == "text/plain":
        suffix = ".txt"
    elif file.type == "application/pdf":
        suffix = ".pdf"
    else:
        # Fail with a clear message instead of an UnboundLocalError later on.
        raise ValueError(f"Unsupported file type: {file.type!r}")

    with tempfile.TemporaryDirectory() as tmp_dir:
        upload_path = os.path.join(tmp_dir, f"upload{suffix}")

        # Close the file before loading: this guarantees the bytes are on disk
        # and lets the loader reopen the path on every platform.
        with open(upload_path, "wb") as upload:
            upload.write(file.content)

        if suffix == ".pdf":
            # PyPDFDirectoryLoader scans a directory for *.pdf files, so it is
            # pointed at the directory holding the upload, not at the file.
            loader = PyPDFDirectoryLoader(tmp_dir)
        else:
            loader = TextLoader(upload_path)

        # Load while the temporary directory still exists.
        documents = loader.load()

    docs = text_splitter.split_documents(documents)
    for i, doc in enumerate(docs):
        # Replace the (temporary) file path with a stable per-chunk label.
        doc.metadata["source"] = f"source_{i}"
    return docs
def get_docSearch(file: AskFileResponse):
    """Build a Chroma vector store from an uploaded file.

    The file is turned into chunked documents by ``process_file`` and those
    chunks are indexed with the module-level ``embeddings``.

    Args:
        file: The uploaded file to index.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    return Chroma.from_documents(process_file(file), embeddings)