Spaces:
Sleeping
Sleeping
File size: 1,195 Bytes
5e20c77 10330bc 5e20c77 10330bc 5e20c77 10330bc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
from chainlit.types import AskFileResponse
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
embeddings = OpenAIEmbeddings()
def process_file(file: AskFileResponse):
import tempfile
if file.type == "text/plain":
Loader = TextLoader
elif file.type == "application/pdf":
Loader = PyPDFDirectoryLoader
with tempfile.NamedTemporaryFile() as tempfile:
tempfile.write(file.content)
loader = Loader(tempfile.name)
documents = loader.load()
# text_splitter = text_splitter()
docs = text_splitter.split_documents(documents)
for i, doc in enumerate(docs):
doc.metadata["source"] = f"source_{i}"
return docs
def get_docSearch(file: AskFileResponse):
docs = process_file(file)
## save data in user session
docsearch = Chroma.from_documents(docs, embeddings)
return docsearch |