# from langchain.chains import ConversationalRetrievalChain
# from langchain.chains.question_answering import load_qa_chain
# from langchain.chains import RetrievalQA
# from langchain.memory import ConversationBufferMemory
# from langchain.memory import ConversationTokenBufferMemory
# from langchain.llms import HuggingFacePipeline
# # from langchain import PromptTemplate
# from langchain.prompts import PromptTemplate
# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
# from langchain.vectorstores import Chroma
# from chromadb.utils import embedding_functions
# from langchain.embeddings import SentenceTransformerEmbeddings
# from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import LlamaCpp
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.document_loaders import (
    CSVLoader,
    DirectoryLoader,
    GitLoader,
    NotebookLoader,
    OnlinePDFLoader,
    PythonLoader,
    TextLoader,
    UnstructuredFileLoader,
    UnstructuredHTMLLoader,
    UnstructuredPDFLoader,
    UnstructuredWordDocumentLoader,
    WebBaseLoader,
    PyPDFLoader,
    UnstructuredMarkdownLoader,
    UnstructuredEPubLoader,
    UnstructuredPowerPointLoader,
    UnstructuredODTLoader,
)
# from transformers import (
# AutoModelForCausalLM,
# AutoTokenizer,
# StoppingCriteria,
# StoppingCriteriaList,
# pipeline,
# GenerationConfig,
# TextStreamer,
# pipeline
# )
# from langchain.llms import HuggingFaceHub
import torch
# from transformers import BitsAndBytesConfig
import os
# from langchain.llms import CTransformers
import streamlit as st
# from langchain.document_loaders.base import BaseLoader
# from langchain.schema import Document
# import gradio as gr
import tempfile
import timeit
import textwrap
# from chromadb.utils import embedding_functions
# from tqdm import tqdm
# tqdm(disable=True, total=0) # initialise internal lock
# tqdm.write("test")
from langchain import PromptTemplate, LLMChain
from langchain.llms import CTransformers
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceBgeEmbeddings
from io import BytesIO
from langchain.vectorstores import FAISS
# def load_model():
# config = {'max_new_tokens': 1024,
# 'repetition_penalty': 1.1,
# 'temperature': 0.1,
# 'top_k': 50,
# 'top_p': 0.9,
# 'stream': True,
# 'threads': int(os.cpu_count() / 2)
# }
# llm = CTransformers(
# model = "TheBloke/zephyr-7B-beta-GGUF",
# model_file = "zephyr-7b-beta.Q4_0.gguf",
# callbacks=[StreamingStdOutCallbackHandler()],
# lib="avx2", #for CPU use
# **config
# # model_type=model_type,
# # max_new_tokens=max_new_tokens, # type: ignore
# # temperature=temperature, # type: ignore
# )
# return llm
# def create_vector_database(loaded_documents):
# # DB_DIR: str = os.path.join(ABS_PATH, "db")
# """
# Creates a vector database using document loaders and embeddings.
# This function loads data from PDF, markdown and text files in the 'data/' directory,
# splits the loaded documents into chunks, transforms them into embeddings using HuggingFace,
# and finally persists the embeddings into a Chroma vector database.
# """
# # Split loaded documents into chunks
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=30, length_function = len)
# chunked_documents = text_splitter.split_documents(loaded_documents)
# # embeddings = HuggingFaceEmbeddings(
# # model_name="sentence-transformers/all-MiniLM-L6-v2"
# # # model_name = "sentence-transformers/all-mpnet-base-v2"
# # )
# embeddings = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
# # embeddings = HuggingFaceBgeEmbeddings(
# # model_name = "BAAI/bge-large-en"
# # )
# # model_name = "BAAI/bge-large-en"
# # model_kwargs = {'device': 'cpu'}
# # encode_kwargs = {'normalize_embeddings': False}
# # embeddings = HuggingFaceBgeEmbeddings(
# # model_name=model_name,
# # model_kwargs=model_kwargs,
# # encode_kwargs=encode_kwargs
# # )
# persist_directory = 'db'
# # Create and persist a Chroma vector database from the chunked documents
# db = Chroma.from_documents(
# documents=chunked_documents,
# embedding=embeddings,
# persist_directory=persist_directory
# # persist_directory=DB_DIR,
# )
# db.persist()
# # db = Chroma(persist_directory=persist_directory,
# # embedding_function=embedding)
# return db
# def set_custom_prompt():
# """
# Prompt template for retrieval for each vectorstore
# """
# prompt_template = """Use the following pieces of information to answer the user's question.
# If you don't know the answer, just say that you don't know, don't try to make up an answer.
# Context: {context}
# Question: {question}
# Only return the helpful answer below and nothing else.
# Helpful answer:
# """
# prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
# return prompt
# def create_chain(llm, prompt, db):
# """
# Creates a Retrieval Question-Answering (QA) chain using a given language model, prompt, and database.
# This function initializes a ConversationalRetrievalChain object with a specific chain type and configurations,
# and returns this chain. The retriever is set up to return the top 3 results (k=3).
# Args:
# llm (any): The language model to be used in the RetrievalQA.
# prompt (str): The prompt to be used in the chain type.
# db (any): The database to be used as the
# retriever.
# Returns:
# ConversationalRetrievalChain: The initialized conversational chain.
# """
# memory = ConversationTokenBufferMemory(llm=llm, memory_key="chat_history", return_messages=True, input_key='question', output_key='answer')
# # chain = ConversationalRetrievalChain.from_llm(
# # llm=llm,
# # chain_type="stuff",
# # retriever=db.as_retriever(search_kwargs={"k": 3}),
# # return_source_documents=True,
# # max_tokens_limit=256,
# # combine_docs_chain_kwargs={"prompt": prompt},
# # condense_question_prompt=CONDENSE_QUESTION_PROMPT,
# # memory=memory,
# # )
# # chain = RetrievalQA.from_chain_type(llm=llm,
# # chain_type='stuff',
# # retriever=db.as_retriever(search_kwargs={'k': 3}),
# # return_source_documents=True,
# # chain_type_kwargs={'prompt': prompt}
# # )
# chain = RetrievalQA.from_chain_type(llm=llm,
# chain_type='stuff',
# retriever=db.as_retriever(search_kwargs={'k': 3}),
# return_source_documents=True
# )
# return chain
# def create_retrieval_qa_bot(loaded_documents):
# # if not os.path.exists(persist_dir):
# # raise FileNotFoundError(f"No directory found at {persist_dir}")
# try:
# llm = load_model() # Assuming this function exists and works as expected
# except Exception as e:
# raise Exception(f"Failed to load model: {str(e)}")
# try:
# prompt = set_custom_prompt() # Assuming this function exists and works as expected
# except Exception as e:
# raise Exception(f"Failed to get prompt: {str(e)}")
# # try:
# # CONDENSE_QUESTION_PROMPT = set_custom_prompt_condense() # Assuming this function exists and works as expected
# # except Exception as e:
# # raise Exception(f"Failed to get condense prompt: {str(e)}")
# try:
# db = create_vector_database(loaded_documents) # Assuming this function exists and works as expected
# except Exception as e:
# raise Exception(f"Failed to get database: {str(e)}")
# try:
# # qa = create_chain(
# # llm=llm, prompt=prompt,CONDENSE_QUESTION_PROMPT=CONDENSE_QUESTION_PROMPT, db=db
# # ) # Assuming this function exists and works as expected
# qa = create_chain(
# llm=llm, prompt=prompt, db=db
# ) # Assuming this function exists and works as expected
# except Exception as e:
# raise Exception(f"Failed to create retrieval QA chain: {str(e)}")
# return qa
# def wrap_text_preserve_newlines(text, width=110):
# # Split the input text into lines based on newline characters
# lines = text.split('\n')
# # Wrap each line individually
# wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
# # Join the wrapped lines back together using newline characters
# wrapped_text = '\n'.join(wrapped_lines)
# return wrapped_text
# def retrieve_bot_answer(query, loaded_documents):
# """
# Retrieves the answer to a given query using a QA bot.
# This function creates an instance of a QA bot, passes the query to it,
# and returns the bot's response.
# Args:
# query (str): The question to be answered by the QA bot.
# Returns:
# dict: The QA bot's response, typically a dictionary with response details.
# """
# qa_bot_instance = create_retrieval_qa_bot(loaded_documents)
# # bot_response = qa_bot_instance({"question": query})
# bot_response = qa_bot_instance({"query": query})
# # Check if the 'answer' key exists in the bot_response dictionary
# # if 'answer' in bot_response:
# # # answer = bot_response['answer']
# # return bot_response
# # else:
# # raise KeyError("Expected 'answer' key in bot_response, but it was not found.")
# # result = bot_response['answer']
# # result = bot_response['result']
# # sources = []
# # for source in bot_response["source_documents"]:
# # sources.append(source.metadata['source'])
# # return result, sources
# result = wrap_text_preserve_newlines(bot_response['result'])
# for source in bot_response["source_documents"]:
# sources.append(source.metadata['source'])
# return result, sources
def main():
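    """Docuverse: upload documents, index them with embeddings, and answer questions with a local LLM."""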
FILE_LOADER_MAPPING = {
"csv": (CSVLoader, {"encoding": "utf-8"}),
"doc": (UnstructuredWordDocumentLoader, {}),
"docx": (UnstructuredWordDocumentLoader, {}),
"epub": (UnstructuredEPubLoader, {}),
"html": (UnstructuredHTMLLoader, {}),
"md": (UnstructuredMarkdownLoader, {}),
"odt": (UnstructuredODTLoader, {}),
"pdf": (PyPDFLoader, {}),
"ppt": (UnstructuredPowerPointLoader, {}),
"pptx": (UnstructuredPowerPointLoader, {}),
"txt": (TextLoader, {"encoding": "utf8"}),
"ipynb": (NotebookLoader, {}),
"py": (PythonLoader, {}),
# Add more mappings for other file extensions and loaders as needed
}
st.title("Docuverse")
# Upload files
uploaded_files = st.file_uploader("Upload your documents", type=["pdf", "md", "txt", "csv", "py", "epub", "html", "ppt", "pptx", "doc", "docx", "odt", "ipynb"], accept_multiple_files=True)
loaded_documents = []
if uploaded_files:
# Create a temporary directory
with tempfile.TemporaryDirectory() as td:
# Move the uploaded files to the temporary directory and process them
for uploaded_file in uploaded_files:
st.write(f"Uploaded: {uploaded_file.name}")
ext = os.path.splitext(uploaded_file.name)[-1][1:].lower()
st.write(f"Uploaded: {ext}")
# Check if the extension is in FILE_LOADER_MAPPING
if ext in FILE_LOADER_MAPPING:
loader_class, loader_args = FILE_LOADER_MAPPING[ext]
# st.write(f"loader_class: {loader_class}")
# Save the uploaded file to the temporary directory
file_path = os.path.join(td, uploaded_file.name)
with open(file_path, 'wb') as temp_file:
temp_file.write(uploaded_file.read())
# Use Langchain loader to process the file
loader = loader_class(file_path, **loader_args)
loaded_documents.extend(loader.load())
else:
st.warning(f"Unsupported file extension: {ext}")
# st.write(f"loaded_documents: {loaded_documents}")
st.write("Chat with the Document:")
query = st.text_input("Ask a question:")
if st.button("Get Answer"):
if query:
# Load model, set prompts, create vector database, and retrieve answer
try:
start = timeit.default_timer()
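                # Generation settings retained for the commented-out CTransformers path below;
                # the active LlamaCpp call passes its own parameters directly and does not read this dict.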
config = {
'max_new_tokens': 1024,
'repetition_penalty': 1.1,
'temperature': 0.1,
'top_k': 50,
'top_p': 0.9,
'stream': True,
'threads': int(os.cpu_count() / 2)
}
# llm = CTransformers(
# model = "TheBloke/zephyr-7B-beta-GGUF",
# model_file = "zephyr-7b-beta.Q4_0.gguf",
# model_type="mistral",
# lib="avx2", #for CPU use
# **config
# )
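                # LlamaCpp loads a local GGUF file via model_path and does not download from the Hugging Face Hub.
                # The filename below is only an assumed example from TheBloke/Mistral-7B-Instruct-v0.2-GGUF; point it at your local copy.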
                llm = LlamaCpp(model_path="mistral-7b-instruct-v0.2.Q4_K_M.gguf", temperature=0.75, max_tokens=2000, top_p=1)
st.write("LLM Initialized:")
model_name = "BAAI/bge-large-en"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceBgeEmbeddings(
model_name=model_name,
model_kwargs=model_kwargs,
encode_kwargs=encode_kwargs
)
# embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
# model_kwargs={"device": "cpu"})
# llm = load_model()
# prompt = set_custom_prompt()
# CONDENSE_QUESTION_PROMPT = set_custom_prompt_condense()
# db = create_vector_database(loaded_documents)
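                # Split the uploaded documents into ~500-character chunks (30-character overlap)
                # and index them in an in-memory FAISS store; the persisted Chroma path stays commented out.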
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=30, length_function = len)
chunked_documents = text_splitter.split_documents(loaded_documents)
persist_directory = 'db'
# Create and persist a Chroma vector database from the chunked documents
db = FAISS.from_documents(chunked_documents, embeddings)
# db = Chroma.from_documents(documents=chunked_documents,embedding=embeddings,persist_directory=persist_directory)
# db.persist()
retriever = db.as_retriever(search_kwargs={"k":1})
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True, verbose=True)
bot_response = qa(query)
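                # Wrap the answer to 50 columns while preserving the model's own line breaks.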
lines = bot_response['result'].split('\n')
wrapped_lines = [textwrap.fill(line, width=50) for line in lines]
wrapped_text = '\n'.join(wrapped_lines)
                sources = [source.metadata['source'] for source in bot_response["source_documents"]]
end = timeit.default_timer()
st.write("Elapsed time:")
st.write(end - start)
# st.write(f"response: {response}")
# Display bot response
st.write("Bot Response:")
st.write(wrapped_text)
st.write(sources)
except Exception as e:
st.error(f"An error occurred: {str(e)}")
else:
st.warning("Please enter a question.")
if __name__ == "__main__":
main()