Document-Reader / src /utils.py
singhjagpreet's picture
creating embedding from docs
99a3f34
raw
history blame
2.14 kB
from chainlit.types import AskFileResponse
import click
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.vectorstores import Chroma
from src.config import Config
# import chainlit as cl
import logging
import openai
import os
from dotenv import load_dotenv
load_dotenv()
def process_file(file: AskFileResponse):
    """Load an uploaded text or PDF file and split it into chunks.

    The upload is written to a temporary file, parsed by the loader matching
    its MIME type, and split with ``Config.text_splitter``.

    Args:
        file: Uploaded file. ``file.type`` (MIME type) selects the loader and
            ``file.content`` is written to a binary temporary file for parsing.

    Returns:
        The chunks returned by ``Config.text_splitter.split_documents``, each
        with ``metadata["source"]`` set to ``"source_<index>"``.

    Raises:
        ValueError: If ``file.type`` is neither ``"text/plain"`` nor
            ``"application/pdf"``.
    """
    import tempfile

    if file.type == "text/plain":
        loader_cls = TextLoader
    elif file.type == "application/pdf":
        # PyPDFDirectoryLoader globs a *directory* for PDFs; handed a single
        # file path it finds nothing. PyPDFLoader is the single-file loader.
        from langchain.document_loaders import PyPDFLoader

        loader_cls = PyPDFLoader
    else:
        # Fail early and clearly instead of hitting an unbound local later.
        raise ValueError(f"Unsupported file type: {file.type!r}")

    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(file.content)
        # The loader reopens the file by name, so buffered bytes must be
        # pushed to disk before it reads.
        tmp.flush()
        documents = loader_cls(tmp.name).load()

    docs = Config.text_splitter.split_documents(documents)
    # Give every chunk a stable, index-based name so it can be cited later.
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"
    return docs
def get_docSearch(file, cl):
    """Build a Chroma vector store from an uploaded file.

    Args:
        file: Uploaded file, passed straight to ``process_file``.
        cl: Object exposing ``user_session.set``; the chunks are stored on it
            under the key ``"docs"``.

    Returns:
        The store returned by ``Chroma.from_documents`` for the file's chunks,
        embedded with ``Config.embeddings``.
    """
    chunks = process_file(file)
    logging.info("files loaded ")

    # Keep the chunks in the user session so they can be looked up later.
    cl.user_session.set("docs", chunks)
    logging.info("docs saved in active session")

    vector_store = Chroma.from_documents(chunks, Config.embeddings)
    logging.info("embedding completed")
    return vector_store
def get_source(sources, all_sources, docs, cl):
    """Resolve cited source names into text elements and a summary line.

    Args:
        sources: Comma-separated source names. Surrounding whitespace and any
            ``"."`` characters are removed from each name. A falsy value means
            nothing was cited.
        all_sources: Sequence of known source names; a name's position is used
            as the index into ``docs``.
        docs: Sequence of objects exposing ``page_content``.
        cl: Object exposing a ``Text(content=..., name=...)`` factory.

    Returns:
        A ``(source_elements, answer)`` tuple. ``source_elements`` holds one
        ``cl.Text`` per recognised name, in citation order. ``answer`` is a
        string: ``"\\nSources: a, b"`` when at least one name was recognised,
        ``"\\nNo sources found"`` when names were given but none matched, and
        ``""`` when ``sources`` is falsy.
    """
    source_elements = []
    # Accumulate into a str: starting from a list made ``+=`` with a string
    # extend it one character at a time, returning a list of single characters.
    answer = ""

    if not sources:
        return source_elements, answer

    found_sources = []
    for source in sources.split(","):
        source_name = source.strip().replace(".", "")
        try:
            index = all_sources.index(source_name)
        except ValueError:
            # Cited name is not a known source; ignore it.
            continue
        text = docs[index].page_content
        found_sources.append(source_name)
        # Text element the message can reference by name.
        source_elements.append(cl.Text(content=text, name=source_name))

    if found_sources:
        answer += f"\nSources: {', '.join(found_sources)}"
    else:
        answer += "\nNo sources found"
    return source_elements, answer