Athsara commited on
Commit
e7afd7f
·
verified ·
1 Parent(s): 7b744b1

Upload 12 files

Browse files
.env ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ OPENAI_API_KEY=""
2
+ HUGGINGFACEHUB_API_TOKEN=""
3
+ GROK_API_KEY=""
.env.example ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ OPENAI_API_KEY=
2
+ HUGGINGFACEHUB_API_TOKEN=
3
+ GROK_API_KEY=
.gitignore ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ .idea
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.9
__pycache__/htmlTemplates.cpython-312.pyc ADDED
Binary file (1.16 kB). View file
 
app.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from dotenv import load_dotenv
4
+ from PyPDF2 import PdfReader
5
+ from langchain.text_splitter import CharacterTextSplitter
6
+ from langchain.embeddings import HuggingFaceEmbeddings # Changed to HuggingFace
7
+ from langchain.vectorstores import FAISS
8
+ from langchain.chat_models import ChatOpenAI # For LLM
9
+ from langchain.memory import ConversationBufferMemory
10
+ from langchain.chains import ConversationalRetrievalChain
11
+ from htmlTemplates import css, bot_template, user_template
12
+
# Extract and concatenate the text of every page from a list of PDF file objects.
def get_pdf_text(pdf_docs):
    """Return the combined text of all pages across the given PDF file objects."""
    pages = []
    for doc in pdf_docs:
        reader = PdfReader(doc)
        for page in reader.pages:
            content = page.extract_text()
            # extract_text() can return None (e.g. image-only pages); skip those.
            if content:
                pages.append(content)
    return "".join(pages)
23
+
# Split raw document text into overlapping chunks sized for embedding.
def get_text_chunks(text):
    """Return newline-separated chunks of *text* with overlap for context continuity."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,    # max characters per chunk
        chunk_overlap=200,  # overlap preserves context across chunk boundaries
        length_function=len,
    )
    return splitter.split_text(text)
34
+
# Build a FAISS vector store over the text chunks using HuggingFace embeddings.
def get_vectorstore(text_chunks, huggingface_api_key):
    """Embed *text_chunks* with a sentence-transformers model and index them in FAISS."""
    # NOTE(review): passing use_auth_token through model_kwargs assumes the
    # underlying SentenceTransformer constructor accepts it — confirm against
    # the installed sentence-transformers version.
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",  # Choose an appropriate model
        model_kwargs={"use_auth_token": huggingface_api_key},
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)
44
+
# Create the conversational retrieval chain backed by GrokAI's OpenAI-compatible API.
def get_conversation_chain(vectorstore, grok_api_key, grok_api_base):
    """Wire a Grok LLM, buffer memory, and the vector-store retriever into one chain."""
    grok_llm = ChatOpenAI(
        openai_api_key=grok_api_key,
        openai_api_base=grok_api_base,  # x.ai endpoint speaks the OpenAI protocol
        model_name="grok-beta",         # Specify GrokAI's model
        temperature=0.5,
    )
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=grok_llm,
        retriever=vectorstore.as_retriever(),
        memory=chat_memory,
    )
63
+
# Route the user's question through the conversation chain and persist the history.
def handle_userinput(user_question):
    """Ask the chain *user_question*; warn instead if documents are not ready yet."""
    if st.session_state.conversation is None:
        st.warning("Documents are still being processed. Please wait.")
        return

    result = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = result['chat_history']
72
+
# Callback fired when the user submits the text input.
def on_enter():
    """Process the pending question (if any) and reset the input widget."""
    question = st.session_state.user_question
    if question:
        handle_userinput(question)
        st.session_state.user_question = ""  # Clear the input box
79
+
# Discover, read, and index the PDFs in a folder, storing the resulting
# conversation chain in Streamlit session state.
def load_and_process_pdfs(folder_path, huggingface_api_key, grok_api_key, grok_api_base):
    """Build the retrieval pipeline from every PDF under *folder_path*.

    Steps: list PDFs -> extract text -> chunk -> embed into FAISS ->
    initialise the conversation chain into st.session_state.conversation.
    Shows Streamlit spinners for progress; errors and returns early when the
    folder contains no PDFs.
    """
    pdf_files = [file for file in os.listdir(folder_path) if file.lower().endswith('.pdf')]
    if not pdf_files:
        st.error(f"No PDF files found in the directory: {folder_path}")
        return

    pdf_docs = [os.path.join(folder_path, file) for file in pdf_files]

    with st.spinner("Processing documents..."):
        # Extract text from PDFs
        with st.spinner("Extracting text from PDFs..."):
            # Open each file in its own `with` so the handle is closed even if
            # extraction raises (the original opened all files up front and only
            # closed them after a successful get_pdf_text call, leaking handles
            # on error).
            raw_text = ""
            for path in pdf_docs:
                with open(path, 'rb') as pdf_file:
                    raw_text += get_pdf_text([pdf_file])

        # Split text into chunks
        with st.spinner("Splitting text into chunks..."):
            text_chunks = get_text_chunks(raw_text)

        # Create vector store using HuggingFace embeddings
        with st.spinner("Creating vector store..."):
            vectorstore = get_vectorstore(text_chunks, huggingface_api_key)

        # Initialize conversation chain with GrokAI LLM
        with st.spinner("Initializing conversation chain..."):
            st.session_state.conversation = get_conversation_chain(vectorstore, grok_api_key, grok_api_base)

    st.success("Documents processed successfully!")
114
+
# Render the alternating user/bot chat transcript and auto-scroll to the bottom.
def display_chat_history():
    """Draw each stored message with its HTML template, then scroll the page down."""
    history = st.session_state.chat_history
    if history:
        for idx, message in enumerate(history):
            # Even indices are user turns, odd indices are bot turns.
            template = user_template if idx % 2 == 0 else bot_template
            st.markdown(template.replace("{{MSG}}", message.content), unsafe_allow_html=True)

    # Inject JavaScript to scroll the entire page to the bottom
    scroll_script = """
    <script>
    // Function to scroll to the bottom of the page
    function scrollToBottom() {
        window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });
    }
    // Delay to ensure the DOM is fully rendered
    setTimeout(scrollToBottom, 100);
    </script>
    """
    st.markdown(scroll_script, unsafe_allow_html=True)
136
+
# Streamlit entry point: load configuration, process documents once, run the chat UI.
def main():
    """Configure the page, ingest the PDF folder on first run, and render the chat."""
    load_dotenv()

    # Credentials come from the .env file
    grok_api_key = os.getenv("GROK_API_KEY")
    grok_api_base = "https://api.x.ai/v1"  # GrokAI's API base URL
    huggingface_api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN")

    st.set_page_config(page_title="Chat with AI Tax Agent", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Initialise session-state slots on the first run only.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    # Title Section
    st.header("Chat with AI Tax Agent :books:")

    # Automatically load and process PDFs on startup (once per session).
    if st.session_state.conversation is None:
        documents_folder = "./documents/"  # Specify your documents folder path here
        load_and_process_pdfs(documents_folder, huggingface_api_key, grok_api_key, grok_api_base)

    # Chat History Section
    display_chat_history()

    # Input Box Section
    st.text_input(
        "Ask a question about your documents:",
        key='user_question',
        on_change=on_enter,
    )


if __name__ == '__main__':
    main()
docs/PDF-LangChain.jpg ADDED
documents/taxagentknowledgebase.pdf ADDED
Binary file (37.3 kB). View file
 
htmlTemplates.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Shared CSS injected once into the Streamlit page; styles the chat bubbles.
css = '''
<style>
.chat-message {
    padding: 1.5rem;
    border-radius: 0.5rem;
    margin-bottom: 1rem;
    display: flex;
}
.chat-message.user {
    background-color: #2b313e;
}
.chat-message.bot {
    background-color: #475063;
}
.chat-message .avatar {
    width: 20%;
}
.chat-message .avatar img {
    max-width: 78px;
    max-height: 78px;
    border-radius: 50%;
    object-fit: cover;
}
.chat-message .message {
    width: 80%;
    padding: 0 1.5rem;
    color: #fff;
}
</style>
'''

# HTML template for a bot turn; the caller substitutes the literal "{{MSG}}"
# placeholder with the message text via str.replace.
bot_template = '''
<div class="chat-message bot">
    <div class="avatar">
        <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''

# HTML template for a user turn; same "{{MSG}}" substitution convention.
user_template = '''
<div class="chat-message user">
    <div class="avatar">
        <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''
new.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# just a simple code example of grok-ai api usage
#
# NOTE: this script performs a network call at import/run time and prints the
# model's reply; it has no functions — running the file IS the example.

import os

# pip install openai
from openai import OpenAI

# Set your API key here
# SECURITY NOTE(review): never commit a real credential here — load it from the
# environment instead (e.g. os.getenv("GROK_API_KEY")); the placeholder below
# must stay a placeholder.
grokapi = "xai-XXXXXXXXXXXXXXXXXX"

# Make a call to function using openai library
# Grok exposes an OpenAI-compatible endpoint, so the standard client works
# with only the base_url changed.
client = OpenAI(api_key=grokapi, base_url="https://api.x.ai/v1")

completion = client.chat.completions.create(
    model="grok-beta",
    messages=[
        {"role": "system", "content": "You are grok, an openai"},
        {"role": "user", "content": "How does X algorithm work?"}
    ]
)

print(completion.choices[0].message)
readme.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MultiPDF Chat App
2
+
3
+ ## Introduction
4
+
5
+ ---
6
+
7
+ The MultiPDF Chat App is a Python application that allows you to chat with multiple PDF documents. You can ask questions about the PDFs using natural language, and the application will provide relevant responses based on the content of the documents. This app utilizes a language model to generate accurate answers to your queries. Please note that the app will only respond to questions related to the loaded PDFs.
8
+
9
+ ## How to Run the Program
10
+
11
+ To set up and run the program, follow these steps:
12
+
13
+ 1. Create a virtual environment:
14
+
15
+ ```bash
16
+ python -m venv venv
17
+ ```
18
+
19
+ 2. Activate the virtual environment:
20
+
21
+ - On Windows:
22
+ ```bash
23
+ venv\Scripts\activate
24
+ ```
25
+ - On macOS/Linux:
26
+ ```bash
27
+ source venv/bin/activate
28
+ ```
29
+
30
+ 3. Install the required dependencies
31
+ ```bash
32
+ pip install -r requirements.txt
33
+ ```
34
+ 4. Obtain an API key from GrokAI and add it to the `.env` file in the project directory.
35
+
36
+ ```commandline
37
+ GROK_API_KEY=your_secret_api_key
38
+ ```
39
+
40
+ 5. Run the application:
41
+ ```bash
42
+ streamlit run app.py
43
+ ```
44
+
45
+ Once completed, the app will open in your default web browser. If it doesn't, check the terminal for the link (usually `http://localhost:8501`).
46
+
47
+ ## How It Works
48
+
49
+ ---
50
+
51
+ ![MultiPDF Chat App Diagram](./docs/PDF-LangChain.jpg)
52
+
53
+ The application follows these steps to provide responses to your questions:
54
+
55
+ 1. PDF Loading: The app reads multiple PDF documents and extracts their text content.
56
+
57
+ 2. Text Chunking: The extracted text is divided into smaller chunks that can be processed effectively.
58
+
59
+ 3. Language Model: The application utilizes a language model to generate vector representations (embeddings) of the text chunks.
60
+
61
+ 4. Similarity Matching: When you ask a question, the app compares it with the text chunks and identifies the most semantically similar ones.
62
+
63
+ 5. Response Generation: The selected chunks are passed to the language model, which generates a response based on the relevant content of the PDFs.
64
+
65
+ ## License
66
+
67
+ ---
68
+
69
+ The MultiPDF Chat App is released under the [MIT License](https://opensource.org/licenses/MIT).
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ PyPDF2
3
+ python-dotenv
4
+ streamlit
5
+ openai
6
+ faiss-cpu
7
+ altair
8
+ tiktoken
9
+ transformers
10
+ huggingface-hub
11
+ InstructorEmbedding
12
+ sentence-transformers
13
+ langchain-community