Spaces:
Sleeping
Sleeping
Upload 12 files
Browse files- .env +3 -0
- .env.example +2 -0
- .gitignore +160 -0
- .python-version +1 -0
- __pycache__/htmlTemplates.cpython-312.pyc +0 -0
- app.py +173 -0
- docs/PDF-LangChain.jpg +0 -0
- documents/taxagentknowledgebase.pdf +0 -0
- htmlTemplates.py +48 -0
- new.py +22 -0
- readme.md +69 -0
- requirements.txt +13 -0
.env
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
OPENAI_API_KEY=""
|
2 |
+
HUGGINGFACEHUB_API_TOKEN=""
|
3 |
+
# SECURITY: a real xai- key was committed on this line — revoke/rotate that key
# immediately and keep .env out of version control (it is already in .gitignore).
GROK_API_KEY=""
|
.env.example
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
OPENAI_API_KEY=
|
2 |
+
HUGGINGFACEHUB_API_TOKEN=
|
.gitignore
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
#poetry.lock
|
103 |
+
|
104 |
+
# pdm
|
105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
106 |
+
#pdm.lock
|
107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
108 |
+
# in version control.
|
109 |
+
# https://pdm.fming.dev/#use-with-ide
|
110 |
+
.pdm.toml
|
111 |
+
|
112 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
113 |
+
__pypackages__/
|
114 |
+
|
115 |
+
# Celery stuff
|
116 |
+
celerybeat-schedule
|
117 |
+
celerybeat.pid
|
118 |
+
|
119 |
+
# SageMath parsed files
|
120 |
+
*.sage.py
|
121 |
+
|
122 |
+
# Environments
|
123 |
+
.env
|
124 |
+
.venv
|
125 |
+
env/
|
126 |
+
venv/
|
127 |
+
ENV/
|
128 |
+
env.bak/
|
129 |
+
venv.bak/
|
130 |
+
|
131 |
+
# Spyder project settings
|
132 |
+
.spyderproject
|
133 |
+
.spyproject
|
134 |
+
|
135 |
+
# Rope project settings
|
136 |
+
.ropeproject
|
137 |
+
|
138 |
+
# mkdocs documentation
|
139 |
+
/site
|
140 |
+
|
141 |
+
# mypy
|
142 |
+
.mypy_cache/
|
143 |
+
.dmypy.json
|
144 |
+
dmypy.json
|
145 |
+
|
146 |
+
# Pyre type checker
|
147 |
+
.pyre/
|
148 |
+
|
149 |
+
# pytype static type analyzer
|
150 |
+
.pytype/
|
151 |
+
|
152 |
+
# Cython debug symbols
|
153 |
+
cython_debug/
|
154 |
+
|
155 |
+
# PyCharm
|
156 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
157 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
158 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
159 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
160 |
+
.idea
|
.python-version
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
3.9
|
__pycache__/htmlTemplates.cpython-312.pyc
ADDED
Binary file (1.16 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import streamlit as st
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
from PyPDF2 import PdfReader
|
5 |
+
from langchain.text_splitter import CharacterTextSplitter
|
6 |
+
from langchain.embeddings import HuggingFaceEmbeddings # Changed to HuggingFace
|
7 |
+
from langchain.vectorstores import FAISS
|
8 |
+
from langchain.chat_models import ChatOpenAI # For LLM
|
9 |
+
from langchain.memory import ConversationBufferMemory
|
10 |
+
from langchain.chains import ConversationalRetrievalChain
|
11 |
+
from htmlTemplates import css, bot_template, user_template
|
12 |
+
|
13 |
+
# Function to extract text from PDF documents
def get_pdf_text(pdf_docs):
    """Return the concatenated text of every page of every given PDF.

    Pages whose extraction yields nothing (e.g. scanned images) are skipped.
    """
    extracted_pages = []
    for pdf in pdf_docs:
        reader = PdfReader(pdf)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                extracted_pages.append(page_text)
    return "".join(extracted_pages)
23 |
+
|
24 |
+
# Function to split text into manageable chunks
def get_text_chunks(text):
    """Split raw document text into overlapping chunks sized for embedding."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,      # characters per chunk
        chunk_overlap=200,    # shared context between adjacent chunks
        length_function=len,
    )
    return splitter.split_text(text)
34 |
+
|
35 |
+
# Function to create a vector store using HuggingFace embeddings
def get_vectorstore(text_chunks, huggingface_api_key):
    """Embed the chunks with a sentence-transformers model and index in FAISS."""
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",  # small, fast model
        model_kwargs={"use_auth_token": huggingface_api_key},
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)
44 |
+
|
45 |
+
# Function to initialize the conversational retrieval chain with GrokAI
def get_conversation_chain(vectorstore, grok_api_key, grok_api_base):
    """Build a ConversationalRetrievalChain backed by Grok's OpenAI-compatible API.

    The chain retrieves relevant chunks from *vectorstore* and keeps the full
    dialogue in a buffer memory under the 'chat_history' key.
    """
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True
    )
    # Grok exposes an OpenAI-compatible endpoint, so ChatOpenAI works as-is.
    llm = ChatOpenAI(
        openai_api_key=grok_api_key,
        openai_api_base=grok_api_base,
        model_name="grok-beta",
        temperature=0.5,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
63 |
+
|
64 |
+
# Function to handle user input and generate responses
def handle_userinput(user_question):
    """Run the question through the conversation chain and store the history."""
    conversation = st.session_state.conversation
    if conversation is None:
        # PDFs have not finished processing yet; nothing to query.
        st.warning("Documents are still being processed. Please wait.")
        return

    result = conversation({'question': user_question})
    st.session_state.chat_history = result['chat_history']
72 |
+
|
73 |
+
# Function triggered when the user presses Enter in the input box
def on_enter():
    """Input-box callback: process the question, then clear the widget."""
    question = st.session_state.user_question
    if not question:
        return
    handle_userinput(question)
    st.session_state.user_question = ""  # Clear the input box
79 |
+
|
80 |
+
# Function to load and process PDF documents
def load_and_process_pdfs(folder_path, huggingface_api_key, grok_api_key, grok_api_base):
    """Scan *folder_path* for PDFs, build the vector store, and set up the chain.

    On success the ConversationalRetrievalChain is stored in
    ``st.session_state.conversation``; if the folder contains no PDFs an
    error is shown and the function returns early.

    Args:
        folder_path: Directory scanned (non-recursively) for ``*.pdf`` files.
        huggingface_api_key: Token forwarded to the HuggingFace embedding model.
        grok_api_key: Key for Grok's OpenAI-compatible chat API.
        grok_api_base: Base URL of that API.
    """
    pdf_files = [file for file in os.listdir(folder_path) if file.lower().endswith('.pdf')]
    if not pdf_files:
        st.error(f"No PDF files found in the directory: {folder_path}")
        return

    pdf_docs = [os.path.join(folder_path, file) for file in pdf_files]

    with st.spinner("Processing documents..."):
        # Extract text from PDFs
        with st.spinner("Extracting text from PDFs..."):
            pdf_file_objects = [open(file, 'rb') for file in pdf_docs]
            try:
                raw_text = get_pdf_text(pdf_file_objects)
            finally:
                # Close handles even if extraction raises — the previous
                # open()/close() sequence leaked them on error.
                for handle in pdf_file_objects:
                    handle.close()

        # Split text into chunks
        with st.spinner("Splitting text into chunks..."):
            text_chunks = get_text_chunks(raw_text)

        # Create vector store using HuggingFace embeddings
        with st.spinner("Creating vector store..."):
            vectorstore = get_vectorstore(text_chunks, huggingface_api_key)

        # Initialize conversation chain with GrokAI LLM
        with st.spinner("Initializing conversation chain..."):
            st.session_state.conversation = get_conversation_chain(vectorstore, grok_api_key, grok_api_base)

    st.success("Documents processed successfully!")
114 |
+
|
115 |
+
# Function to display chat history with auto-scrolling
def display_chat_history():
    """Render the chat transcript and auto-scroll the page to the newest turn."""
    history = st.session_state.chat_history
    if history:
        for index, message in enumerate(history):
            # Even positions hold user turns, odd positions hold bot replies.
            template = user_template if index % 2 == 0 else bot_template
            st.markdown(template.replace("{{MSG}}", message.content), unsafe_allow_html=True)

    # Inject JavaScript to scroll the entire page to the bottom
    scroll_script = """
    <script>
        // Function to scroll to the bottom of the page
        function scrollToBottom() {
            window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });
        }
        // Delay to ensure the DOM is fully rendered
        setTimeout(scrollToBottom, 100);
    </script>
    """
    st.markdown(scroll_script, unsafe_allow_html=True)
136 |
+
|
137 |
+
# Main function to run the Streamlit app
def main():
    """Entry point: configure the page, process the PDFs once, run the chat UI."""
    load_dotenv()

    # Retrieve credentials from .env
    grok_api_key = os.getenv("GROK_API_KEY")
    grok_api_base = "https://api.x.ai/v1"  # GrokAI's OpenAI-compatible endpoint
    huggingface_api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN")

    # set_page_config must be the first Streamlit call on the page.
    st.set_page_config(page_title="Chat with AI Tax Agent", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Fail fast with a clear message instead of an opaque API error later.
    if not grok_api_key:
        st.error("GROK_API_KEY is not set. Add it to your .env file.")
        return

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    # Title Section
    st.header("Chat with AI Tax Agent :books:")

    # Automatically load and process PDFs on startup (only once per session).
    if st.session_state.conversation is None:
        documents_folder = "./documents/"  # Specify your documents folder path here
        load_and_process_pdfs(documents_folder, huggingface_api_key, grok_api_key, grok_api_base)

    # Chat History Section
    display_chat_history()

    # Input Box Section
    st.text_input(
        "Ask a question about your documents:",
        key='user_question',
        on_change=on_enter
    )
171 |
+
|
172 |
+
# Entry point when run directly (e.g. `streamlit run app.py`).
if __name__ == '__main__':
    main()
docs/PDF-LangChain.jpg
ADDED
![]() |
documents/taxagentknowledgebase.pdf
ADDED
Binary file (37.3 kB). View file
|
|
htmlTemplates.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Shared HTML/CSS snippets for the Streamlit chat UI.
# `css` is injected once per page; `bot_template` / `user_template` are
# rendered per message with the "{{MSG}}" placeholder replaced by the text.

css = '''
<style>
.chat-message {
    padding: 1.5rem;
    border-radius: 0.5rem;
    margin-bottom: 1rem;
    display: flex;
}
.chat-message.user {
    background-color: #2b313e;
}
.chat-message.bot {
    background-color: #475063;
}
.chat-message .avatar {
    width: 20%;
}
.chat-message .avatar img {
    max-width: 78px;
    max-height: 78px;
    border-radius: 50%;
    object-fit: cover;
}
.chat-message .message {
    width: 80%;
    padding: 0 1.5rem;
    color: #fff;
}
</style>
'''

# Bot message bubble: avatar image on the left, message body on the right.
bot_template = '''
<div class="chat-message bot">
    <div class="avatar">
        <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''

# User message bubble; same layout as the bot bubble with a different avatar.
user_template = '''
<div class="chat-message user">
    <div class="avatar">
        <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''
new.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# just a simple code example of grok-ai api usage
# Sends one chat completion request to Grok and prints the reply message.

import os

# pip install openai
from openai import OpenAI

# Set your API key here
# NOTE(review): avoid committing real keys — prefer os.getenv("GROK_API_KEY")
# loaded from an untracked .env; the placeholder below must stay a placeholder.
grokapi = "xai-XXXXXXXXXXXXXXXXXX"

# Make a call to function using openai library
# Grok exposes an OpenAI-compatible endpoint, so the stock OpenAI client works.
client = OpenAI(api_key=grokapi, base_url="https://api.x.ai/v1")

completion = client.chat.completions.create(
    model="grok-beta",
    messages=[
        {"role": "system", "content": "You are grok, an openai"},
        {"role": "user", "content": "How does X algorithm work?"}
    ]
)

# The reply is the message of the first (and only) returned choice.
print(completion.choices[0].message)
readme.md
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MultiPDF Chat App
|
2 |
+
|
3 |
+
## Introduction
|
4 |
+
|
5 |
+
---
|
6 |
+
|
7 |
+
The MultiPDF Chat App is a Python application that allows you to chat with multiple PDF documents. You can ask questions about the PDFs using natural language, and the application will provide relevant responses based on the content of the documents. This app utilizes a language model to generate accurate answers to your queries. Please note that the app will only respond to questions related to the loaded PDFs.
|
8 |
+
|
9 |
+
## How to Run the Program
|
10 |
+
|
11 |
+
To set up and run the program, follow these steps:
|
12 |
+
|
13 |
+
1. Create a virtual environment:
|
14 |
+
|
15 |
+
```bash
|
16 |
+
python -m venv venv
|
17 |
+
```
|
18 |
+
|
19 |
+
2. Activate the virtual environment:
|
20 |
+
|
21 |
+
- On Windows:
|
22 |
+
```bash
|
23 |
+
venv\Scripts\activate
|
24 |
+
```
|
25 |
+
- On macOS/Linux:
|
26 |
+
```bash
|
27 |
+
source venv/bin/activate
|
28 |
+
```
|
29 |
+
|
30 |
+
3. Install the required dependencies
|
31 |
+
```bash
|
32 |
+
pip install -r requirements.txt
|
33 |
+
```
|
34 |
+
4. Obtain an API key from GrokAI and add it to the `.env` file in the project directory.
|
35 |
+
|
36 |
+
```commandline
|
37 |
+
GROK_API_KEY=your_secret_api_key
|
38 |
+
```
|
39 |
+
|
40 |
+
5. Run the application:
|
41 |
+
```bash
|
42 |
+
streamlit run app.py
|
43 |
+
```
|
44 |
+
|
45 |
+
Once completed, the app will open in your default web browser. If it doesn't, check the terminal for the link (usually `http://localhost:8501`).
|
46 |
+
|
47 |
+
## How It Works
|
48 |
+
|
49 |
+
---
|
50 |
+
|
51 |
+

|
52 |
+
|
53 |
+
The application follows these steps to provide responses to your questions:
|
54 |
+
|
55 |
+
1. PDF Loading: The app reads multiple PDF documents and extracts their text content.
|
56 |
+
|
57 |
+
2. Text Chunking: The extracted text is divided into smaller chunks that can be processed effectively.
|
58 |
+
|
59 |
+
3. Language Model: The application utilizes a language model to generate vector representations (embeddings) of the text chunks.
|
60 |
+
|
61 |
+
4. Similarity Matching: When you ask a question, the app compares it with the text chunks and identifies the most semantically similar ones.
|
62 |
+
|
63 |
+
5. Response Generation: The selected chunks are passed to the language model, which generates a response based on the relevant content of the PDFs.
|
64 |
+
|
65 |
+
## License
|
66 |
+
|
67 |
+
---
|
68 |
+
|
69 |
+
The MultiPDF Chat App is released under the [MIT License](https://opensource.org/licenses/MIT).
|
requirements.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
langchain
|
2 |
+
PyPDF2
|
3 |
+
python-dotenv
|
4 |
+
streamlit
|
5 |
+
openai
|
6 |
+
faiss-cpu
|
7 |
+
altair
|
8 |
+
tiktoken
|
9 |
+
transformers
|
10 |
+
huggingface-hub
|
11 |
+
InstructorEmbedding
|
12 |
+
sentence-transformers
|
13 |
+
langchain-community
|