Spaces:
Sleeping
Sleeping
File size: 1,901 Bytes
5e20c77 99a3f34 5e20c77 1cb46fc 10330bc c0c01c6 10330bc 99a3f34 1cb46fc c0c01c6 5e20c77 c0c01c6 10330bc 1cb46fc 10330bc c0c01c6 10330bc 5e20c77 c0c01c6 10330bc c0c01c6 99a3f34 c0c01c6 99a3f34 c0c01c6 99a3f34 c0c01c6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
from chainlit.types import AskFileResponse
import click
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
import chainlit as cl
from src.config import Config
import logging
# Module-level singletons shared by every request in this process.
# Default chunking parameters — presumably acceptable for the demo; confirm if tuning is needed.
text_splitter = RecursiveCharacterTextSplitter()
# NOTE(review): instantiating OpenAIEmbeddings at import time requires
# OPENAI_API_KEY to be configured in the environment — verify deployment config.
embeddings = OpenAIEmbeddings()
def process_file(file: AskFileResponse):
    """Load an uploaded file and split it into uniquely-tagged document chunks.

    Args:
        file: Uploaded file from chainlit (plain text or PDF).

    Returns:
        list: Split documents, each with a unique ``source_<i>`` in
        ``metadata["source"]`` (consumed later by ``get_source``).

    Raises:
        ValueError: If the file's MIME type is neither text/plain nor
            application/pdf.
    """
    import tempfile

    if file.type == "text/plain":
        loader_cls = TextLoader
    elif file.type == "application/pdf":
        loader_cls = PyPDFLoader
    else:
        # Previously an unsupported type fell through and raised an opaque
        # UnboundLocalError on `Loader`; fail early with a clear message.
        raise ValueError(f"Unsupported file type: {file.type}")

    # The original named this variable `tempfile`, shadowing the module.
    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(file.content)
        # Flush so the loader, which re-opens the file by name, sees the
        # full content rather than a partially-buffered file.
        tmp.flush()
        loader = loader_cls(tmp.name)
        documents = loader.load()
        docs = text_splitter.split_documents(documents)
        for i, doc in enumerate(docs):
            # Stable per-chunk source id, matched by get_source's naming scheme.
            doc.metadata["source"] = f"source_{i}"
        return docs
def get_docsearch(file: AskFileResponse):
    """Build an in-memory Chroma vector index from an uploaded file.

    The split chunks are also stashed in the chainlit user session under the
    "docs" key so later handlers can map answers back to their sources.

    Args:
        file: Uploaded file from chainlit.

    Returns:
        A Chroma vector store built over the file's chunks using the
        module-level OpenAI embeddings.
    """
    split_docs = process_file(file)

    # Keep the chunks around for this user's session.
    cl.user_session.set("docs", split_docs)

    # Index the chunks; embeddings is the shared module-level instance.
    return Chroma.from_documents(split_docs, embeddings)
def get_source(answer, source_documents):
    """Build chainlit Text elements, one per retrieved source document.

    Args:
        answer: The answer string the sources relate to (see NOTE below).
        source_documents: Iterable of retrieved documents, or None/empty.

    Returns:
        list: ``cl.Text`` elements named ``source_0``, ``source_1``, ...
        (empty when there are no source documents).
    """
    text_elements = []
    if source_documents:
        for source_idx, source_doc in enumerate(source_documents):
            source_name = f"source_{source_idx}"
            text_elements.append(
                cl.Text(content=source_doc.page_content, name=source_name)
            )
    source_names = [text_el.name for text_el in text_elements]

    # NOTE(review): these += operations rebind the *local* `answer` only —
    # str is immutable, so the caller's answer is never updated and this is
    # effectively dead code. If the "Sources:" line should reach the UI, the
    # updated answer must also be returned; confirm against the caller before
    # changing the return contract.
    if source_names:
        answer += f"\nSources: {', '.join(source_names)}"
    else:
        answer += "\nNo source found"
    return text_elements