Spaces:
Sleeping
Sleeping
import tempfile

from chainlit.types import AskFileResponse
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
# Shared splitter used for every upload: chunk size 1000 with an overlap of
# 100 between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)

# Embedding client shared by all vector-store builds in this module.
embeddings = OpenAIEmbeddings()
def process_file(file: AskFileResponse):
    """Load an uploaded file and split it into chunked documents.

    The upload's content is written to a temporary file so that a LangChain
    loader can read it from a filesystem path. The loaded documents are then
    split with the module-level ``text_splitter`` and every chunk is tagged
    with a positional ``source`` identifier.

    Args:
        file: The uploaded file. ``file.type`` is compared against MIME type
            strings and ``file.content`` is written to a binary temp file.

    Returns:
        The list of document chunks, each with ``metadata["source"]`` set to
        ``"source_<index>"``.

    Raises:
        ValueError: If ``file.type`` is neither ``"text/plain"`` nor
            ``"application/pdf"``.
    """
    if file.type == "text/plain":
        loader_cls = TextLoader
    elif file.type == "application/pdf":
        # A single uploaded PDF needs the single-file loader; the directory
        # loader expects a directory path and globs it for PDF files.
        loader_cls = PyPDFLoader
    else:
        # Fail loudly instead of hitting an unbound local further down.
        raise ValueError(f"Unsupported file type: {file.type!r}")

    # NOTE(review): reopening a NamedTemporaryFile by name while it is still
    # open is not supported on Windows -- confirm the deployment platform.
    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(file.content)
        # Push buffered bytes to disk before the loader reopens the path.
        tmp.flush()
        documents = loader_cls(tmp.name).load()

    docs = text_splitter.split_documents(documents)
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"
    return docs
def get_docSearch(file: AskFileResponse):
    """Build a Chroma vector store from an uploaded file.

    The file is turned into document chunks by ``process_file`` and the
    chunks are indexed with the module-level ``embeddings``.

    Args:
        file: The uploaded file to index.

    Returns:
        The vector store created by ``Chroma.from_documents``. Nothing is
        stored in the user session here; the caller keeps the returned store.
    """
    return Chroma.from_documents(process_file(file), embeddings)