Upload 3 files
- main.py +122 -0
- requirements.txt +11 -0
- util.py +101 -0
main.py
ADDED
@@ -0,0 +1,122 @@
from util import *
from streamlit_option_menu import option_menu
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from dotenv import load_dotenv

# --- PAGE CONFIGURATION ---
st.set_page_config(page_title="Doc Chat", page_icon=":robot_face:", layout="centered")

# --- SETUP SESSION STATE VARIABLES ---
if "vector_store" not in st.session_state:
    st.session_state.vector_store = False
if "response" not in st.session_state:
    st.session_state.response = None
if "prompt_activation" not in st.session_state:
    st.session_state.prompt_activation = False
if "conversation" not in st.session_state:
    st.session_state.conversation = None
if "chat_history" not in st.session_state:
    st.session_state.chat_history = None
if "prompt" not in st.session_state:
    st.session_state.prompt = False

load_dotenv()

# --- SIDEBAR CONFIGURATION ---
st.sidebar.header('Configuration')
groq_api_key = sidebar_api_key_configuration()
model = sidebar_groq_model_selection()

# --- MAIN PAGE CONFIGURATION ---
st.title("Doc Chat :robot_face:")
st.write("*Interrogate Documents :books:, Ignite Insights: AI at Your Service*")
st.write(':blue[***Powered by Groq AI Inference Technology***]')

# ---- NAVIGATION MENU -----
selected = option_menu(
    menu_title=None,
    options=["Doc Chat", "Reference", "About"],
    icons=["robot", "bi-file-text-fill", "app"],  # https://icons.getbootstrap.com
    orientation="horizontal",
)

llm = ChatGroq(groq_api_key=groq_api_key, model_name=model)

prompt = ChatPromptTemplate.from_template(
    """
    Answer the question based on the provided context only. If the question is not within the context, do not try to answer
    and respond that the asked question is out of context or something similar.
    Please provide the most accurate response based on the question.
    <context>
    {context}
    </context>
    Questions: {input}
    """
)

# ----- SETUP Doc Chat MENU ------
if selected == "Doc Chat":
    st.subheader("Upload PDF(s)")
    pdf_docs = st.file_uploader("Upload your PDFs", type=['pdf'], accept_multiple_files=True,
                                disabled=not st.session_state.prompt_activation, label_visibility='collapsed')
    process = st.button("Process", type="primary", key="process", disabled=not pdf_docs)

    if process:
        with st.spinner("Processing ..."):
            st.session_state.vector_store = create_vectorstore(pdf_docs)
            st.session_state.prompt = True
            st.success('Database is ready')

    st.divider()

    if "messages" not in st.session_state:
        st.session_state["messages"] = [{"role": "assistant", "content": "How can I help you?"}]

    for msg in st.session_state.messages:
        st.chat_message(msg["role"]).write(msg["content"])

    container = st.container(border=True)
    if question := st.chat_input(placeholder='Enter your question related to uploaded document',
                                 disabled=not st.session_state.prompt):
        st.session_state.messages.append({"role": "user", "content": question})
        st.chat_message("user").write(question)

        with st.spinner('Processing...'):
            st.session_state.response = get_llm_response(llm, prompt, question)
            st.session_state.messages.append({"role": "assistant", "content": st.session_state.response['answer']})
            st.chat_message("assistant").write(st.session_state.response['answer'])

# ----- SETUP REFERENCE MENU ------
if selected == "Reference":
    st.title("Reference & Context")
    if st.session_state.response is not None:
        for i, doc in enumerate(st.session_state.response["context"]):
            with st.expander(f'Reference # {i + 1}'):
                st.write(doc.page_content)

# ----- SETUP ABOUT MENU ------
if selected == "About":
    with st.expander("About this App"):
        st.markdown('''This app allows you to chat with your PDF documents. It has the following functionality:

- Allows you to chat with multiple PDF documents
- Supports Groq AI inference technology
- Displays the response context and document references
''')
    with st.expander("Which Large Language Models are supported by this App?"):
        st.markdown('''This app supports the following LLMs served by Groq:

- Chat Models -- Groq
  - Llama3-8b-8192
  - Llama3-70b-8192
  - Mixtral-8x7b-32768
  - Gemma-7b-it
''')

    with st.expander("Which library is used for the vectorstore?"):
        st.markdown('''This app uses FAISS for similarity search and as the vectorstore.
''')

    with st.expander("Whom to contact regarding this app?"):
        st.markdown('''Contact [Sree Narayanan]([email protected])
''')
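Note that main.py calls load_dotenv() and the helpers in util.py read GROQ_API_KEY and INFERENCE_API_KEY from the environment, so running the app locally needs a .env file along these lines (placeholder values, not real keys; the sidebar check expects a 56-character key starting with gsk_, and the inference key is typically a Hugging Face API token):

GROQ_API_KEY=gsk_your_56_character_groq_key_here
INFERENCE_API_KEY=your_huggingface_inference_api_token

With that in place, the app starts with: streamlit run main.py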
requirements.txt
ADDED
@@ -0,0 +1,11 @@
pypdf
langchain
langchain-core
langchain-groq
langchain-community
streamlit
streamlit-option-menu
python-dotenv
boto3
faiss-cpu
gpt4all
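The dependencies above are unpinned; installing them with pip install -r requirements.txt in a fresh virtual environment is the expected setup step. boto3 and gpt4all appear to support the alternative embedding backends that are commented out in util.py rather than the default Hugging Face Inference API path.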
util.py
ADDED
@@ -0,0 +1,101 @@
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.embeddings.bedrock import BedrockEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from dotenv import load_dotenv
import streamlit as st
import os

load_dotenv()


# Read the Groq API key from the environment
def get_api_key():
    try:
        groq_api_key = os.getenv("GROQ_API_KEY", "")
        return groq_api_key
    except Exception as e:
        print(e)


# Read the Hugging Face Inference API key from the environment
def get_inference_api_key():
    try:
        inference_api_key = os.getenv("INFERENCE_API_KEY", "")
        return inference_api_key
    except Exception as e:
        print(e)


# Function for API configuration at sidebar
def sidebar_api_key_configuration():
    groq_api_key = get_api_key()
    if groq_api_key == '':
        st.sidebar.warning('Enter the API Key(s) 🗝️')
        st.session_state.prompt_activation = False
    elif groq_api_key.startswith('gsk_') and (len(groq_api_key) == 56):
        st.sidebar.success("Let's Proceed!", icon='👉')
        st.session_state.prompt_activation = True
    else:
        st.sidebar.warning('Please enter the correct API Key 🗝️!', icon='⚠️')
        st.session_state.prompt_activation = False
    return groq_api_key


# Model selection in the sidebar
def sidebar_groq_model_selection():
    st.sidebar.subheader("Model Selection")
    model = st.sidebar.selectbox('Select the Model', ('Llama3-8b-8192', 'Llama3-70b-8192', 'Mixtral-8x7b-32768',
                                                      'Gemma-7b-it'), label_visibility="collapsed")
    return model


# Read PDF data
def read_pdf_data(pdf_docs):
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


# Split data into chunks
def split_data(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    text_chunks = text_splitter.split_text(text)
    return text_chunks


# Choose the embedding backend (Bedrock and Ollama alternatives are kept commented out)
def get_embedding_function():
    # embeddings = BedrockEmbeddings(
    #     credentials_profile_name="default", region_name="us-east-1"
    # )
    # embeddings = OllamaEmbeddings(model="nomic-embed-text")
    inference_api_key = get_inference_api_key()

    embeddings = HuggingFaceInferenceAPIEmbeddings(
        api_key=inference_api_key, model_name="sentence-transformers/all-MiniLM-l6-v2"
    )
    return embeddings


# Create vectorstore
def create_vectorstore(pdf_docs):
    raw_text = read_pdf_data(pdf_docs)      # Get PDF text
    text_chunks = split_data(raw_text)      # Get the text chunks
    embeddings = get_embedding_function()   # Get the embedding function
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore


# Get the LLM response to the user's question via a retrieval chain
def get_llm_response(llm, prompt, question):
    document_chain = create_stuff_documents_chain(llm, prompt)
    retrieval_chain = create_retrieval_chain(st.session_state.vector_store.as_retriever(), document_chain)
    response = retrieval_chain.invoke({'input': question})
    return response
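As a usage note, the helpers in util.py can also be exercised outside the Streamlit UI. The sketch below is a minimal, hypothetical example: it assumes a local PDF ("sample.pdf" is a placeholder path) and that GROQ_API_KEY and INFERENCE_API_KEY are set in the environment, and it rebuilds the retrieval chain directly instead of calling get_llm_response, because that helper reads the vectorstore from st.session_state.

from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.vectorstores import FAISS

from util import read_pdf_data, split_data, get_embedding_function

load_dotenv()  # expects GROQ_API_KEY and INFERENCE_API_KEY in .env

# Build a FAISS index from a local file; "sample.pdf" is a placeholder path.
raw_text = read_pdf_data(["sample.pdf"])  # PdfReader accepts file paths as well as uploads
chunks = split_data(raw_text)
vector_store = FAISS.from_texts(texts=chunks, embedding=get_embedding_function())

# Same prompt shape as main.py: stuffed context plus the user question.
prompt = ChatPromptTemplate.from_template(
    "Answer only from the provided context.\n<context>\n{context}\n</context>\nQuestion: {input}"
)
llm = ChatGroq(model_name="Llama3-8b-8192")  # picks up GROQ_API_KEY from the environment

document_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(vector_store.as_retriever(), document_chain)

response = retrieval_chain.invoke({"input": "What is this document about?"})
print(response["answer"])             # the generated answer
for doc in response["context"]:       # the retrieved source chunks
    print(doc.page_content[:200])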