Upload 13 files
- LICENSE +21 -0
- README.md +0 -12
- app.py +169 -0
- pyproject.toml +12 -0
- requirements.txt +28 -0
- setup.py +34 -0
- src/Bot/__init__.py +0 -0
- src/Bot/__pycache__/__init__.cpython-310.pyc +0 -0
- src/Bot/__pycache__/logger.cpython-310.pyc +0 -0
- src/Bot/exception.py +21 -0
- src/Bot/logger.py +18 -0
- src/Bot/utils/__init__.py +58 -0
- src/Bot/utils/__pycache__/__init__.cpython-310.pyc +0 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Sachin Sen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
CHANGED
@@ -1,12 +0,0 @@
----
-title: Data Extracter
-emoji: 🦀
-colorFrom: pink
-colorTo: blue
-sdk: streamlit
-sdk_version: 1.38.0
-app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,169 @@
+import streamlit as st
+from src.Bot.utils import OCR
+import time
+import os
+import gc
+from langchain_community.docstore.in_memory import InMemoryDocstore
+from langchain_community.vectorstores import FAISS
+import shutil
+from dotenv import load_dotenv
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain.retrievers import SelfQueryRetriever
+from langchain_groq import ChatGroq
+from langchain_core.runnables.history import RunnableWithMessageHistory
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_chroma import Chroma
+from langchain.chains.retrieval import create_retrieval_chain
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+from langchain_community.chat_message_histories import ChatMessageHistory
+from langchain_core.chat_history import BaseChatMessageHistory
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+st.title("Conversational RAG with PDF uploads, OCR, and Chat History")
+st.write("Upload PDFs, perform OCR, and chat with their content")
+
+st.header("Enter API Keys")
+if "groq_api_key" not in st.session_state:
+    st.session_state["groq_api_key"] = None
+if "hf_token" not in st.session_state:
+    st.session_state["hf_token"] = None
+
+if "pdf" not in st.session_state:
+    st.session_state["pdf"] = False
+
+if "chat_button" not in st.session_state:
+    st.session_state["chat_button"] = False
+
+if "default_question" not in st.session_state:
+    st.session_state["default_question"] = False
+
+if "vectorstore" not in st.session_state:
+    st.session_state["vectorstore"] = None
+
+# Input for GROQ API and Hugging Face API
+groq_api_key = st.text_input("Enter your GROQ API Key", type="password")
+hf_token = st.text_input("Enter your Hugging Face API Key", type="password")
+
+if st.button("Submit API Keys"):
+    st.session_state["groq_api_key"] = groq_api_key
+    st.session_state["hf_token"] = hf_token
+    st.success("API keys submitted successfully!")
+
+if st.session_state["groq_api_key"] and st.session_state["hf_token"]:
+    os.environ["GROQ_API_KEY"] = st.session_state["groq_api_key"]
+    os.environ['HF_TOKEN'] = st.session_state["hf_token"]
+
+    llm = ChatGroq(groq_api_key=st.session_state["groq_api_key"], model_name="Gemma2-9b-It")
+    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+
+    st.write("API Keys are set. You can now upload a PDF and start working.")
+
+    file_upload = st.sidebar.file_uploader("Upload your PDF", type="pdf")
+
+    if file_upload:
+        input_pdf_path = os.path.join(os.getcwd(), "uploaded_file.pdf")
+        with open(input_pdf_path, "wb") as f:
+            f.write(file_upload.getvalue())
+
+        ocr_button = st.sidebar.button("OCR")
+        if ocr_button:
+            ocr = OCR(input_pdf_path)
+            output_file_path = ocr.do_ocr()
+
+            st.session_state.pdf = True
+            st.write(output_file_path)
+
+            # Clear existing Chroma DB instance
+            #st.session_state.vectorstore = None
+
+        chat_button = st.sidebar.button("Chat")
+
+        if chat_button:
+            st.session_state.chat_button = True
+
+        clear_history = st.sidebar.button("Clear History")
+
+        if clear_history:
+
+            st.session_state.vectorstore = None  # Ensure Chroma DB is cleared
+            st.session_state["pdf"] = False
+            st.session_state["chat_button"] = False
+            st.session_state["default_question"] = False
+            st.session_state["vectorstore"] = None
+            st.write("History cleared and Chroma DB removed from memory.")
+
+        if st.session_state.pdf and st.session_state.chat_button:
+            default_output_dir = os.path.join(os.getcwd(), "output")
+            os.makedirs(default_output_dir, exist_ok=True)
+            output = os.path.join(default_output_dir, "output.pdf")
+            loader = PyMuPDFLoader(output)
+            documents = loader.load()
+
+            text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
+            splits = text_splitter.split_documents(documents)
+
+            # Create a new Chroma instance
+            st.session_state.vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
+            retriever = st.session_state.vectorstore.as_retriever()
+
+            system_prompt = (
+                """You are an intelligent assistant tasked with extracting specific details from a document. I will list the fields I need information on, and you should provide the answers based on the document content. Please extract and format the answers clearly for each of the following fields:
+
+Please provide the answers in the same order, clearly labeling each field.
+"\n\n"
+"{context}"
+"""
+            )
+
+            prompt = ChatPromptTemplate.from_messages(
+                [
+                    ("system", system_prompt),
+                    ("human", "{input}"),
+                ]
+            )
+
+            question_answer_chain = create_stuff_documents_chain(llm, prompt)
+            rag_chain = create_retrieval_chain(retriever, question_answer_chain)
+
+            if "default_question" not in st.session_state:
+                st.session_state.default_question = False
+
+            default_questions = st.button("Ask Default Questions to PDF")
+
+            question = ["who is second party give its details",
+                        "who is first party", "carpet Area and builtup area", "Rent escalation details",
+                        "Transaction Type (sale/ Lease)", "Registry Date", "Document or registration Number",
+                        "Village name", "Transaction Based on (Builtup or Carpet if both are given consider Builtup)",
+                        "Stamp Duty if Given", "Total Car Parking", "Refund of Interest Fee",
+                        "escalation chart with start date, end date, rent per sqft after escalation percentage by calculation",
+                        "Car parking Charges", "Cam charges per Square feet", "rent per in 1st year",
+                        "rent value", "Lease start date", "Lease End date calculate by start date if not given",
+                        "lock in period in months if given if not give 'NA'", "Notice Period in days or months",
+                        "Location and Floor of area leased to second party", "security Deposit amount"]
+
+            if default_questions:
+                resp = []
+                for i in question:
+                    response = rag_chain.invoke({"input": i})
+                    if 'answer' in response:
+                        resp.append(f"{i} : {response['answer']}")
+                    else:
+                        resp.append(f"{i} : No answer found")
+                    time.sleep(1)
+                st.write("Default questions are selected")
+                st.write(resp)
+
+            user_input = st.text_input("Your question:")
+            if user_input:
+                response = rag_chain.invoke({"input": user_input})
+                st.write("Assistant:", response['answer'])
+
+        else:
+            st.write("<--- Run OCR FIRST")
+else:
+    st.warning("Please enter your API keys to proceed.")
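Note on the chain wiring above: create_retrieval_chain returns a dict that normally carries the retrieved documents under "context" and the model reply under "answer", which is why app.py guards with "if 'answer' in response". A condensed, hypothetical sketch of the same retrieval pipeline outside Streamlit, assuming GROQ_API_KEY is already set in the environment and the OCR step has produced output/output.pdf:

# Hypothetical standalone sketch of the RAG wiring used in app.py (not part of this commit).
import os
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

docs = PyMuPDFLoader(os.path.join("output", "output.pdf")).load()   # OCR'd PDF from the OCR step
splits = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500).split_documents(docs)
vectorstore = Chroma.from_documents(splits, HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"))

prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer strictly from the document.\n\n{context}"),
    ("human", "{input}"),
])
rag_chain = create_retrieval_chain(
    vectorstore.as_retriever(),
    create_stuff_documents_chain(ChatGroq(model_name="Gemma2-9b-It"), prompt),
)
result = rag_chain.invoke({"input": "Who is the second party?"})
print(result.get("answer", "No answer found"))   # same 'answer' key that app.py reads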
pyproject.toml
ADDED
@@ -0,0 +1,12 @@
+[build-system]
+requires = ['setuptools>=42.0', "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.pytest.ini_options]
+testpaths = [
+    "tests"
+]
+
+[tool.mypy]
+mypy_path = "src"
+ignore_missing_imports = true
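The [tool.pytest.ini_options] table above points pytest at a tests/ directory that is not included in this upload. A minimal placeholder test it would discover could look like the following (hypothetical file, assuming the src layout is importable, e.g. after an editable install):

# tests/test_logger.py -- hypothetical example, not part of this commit
from Bot.logger import logger

def test_logger_name():
    # logger.py registers its logger under the name "Bot"
    assert logger.name == "Bot"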
requirements.txt
ADDED
@@ -0,0 +1,28 @@
+ipykernel
+Langchain
+python-dotenv
+langchain-community
+langchain_groq
+#langchain-google
+bs4
+langchain-core
+faiss-cpu
+sentence-transformers
+#PyPDF
+fastapi
+uvicorn
+langserve
+langchain-chroma
+langchain-huggingface
+wikipedia
+arxiv
+duckduckgo-search
+nltk
+unstructured
+youtube-transcript-api
+langserve
+tesseract
+pytesseract
+streamlit
+ocrmypdf
+gs
setup.py
ADDED
@@ -0,0 +1,34 @@
+import setuptools
+
+_version_ = '0.0.1'
+REPO_NAME = 'BOT'
+AUTHOR_NAME = 'Sachinsen1295'
+SOURCE_REPO = "BOT"
+AUTHOR_EMAIL = "[email protected]"
+
+
+with open("README.md", "r", encoding="utf-8") as f:
+    LONG_DESCRIPTION = f.read()
+
+with open("LICENSE", 'r') as L:
+    LICENSE = L.read()
+
+setuptools.setup(
+    name=SOURCE_REPO,
+    version=_version_,
+    author=AUTHOR_NAME,
+    author_email=AUTHOR_EMAIL,
+    description="This is my Deed extraction Bot",
+    long_description=LONG_DESCRIPTION,
+    long_description_content_type="text/markdown",
+    url=f"https://github.com/{AUTHOR_NAME}/{REPO_NAME}",
+    license=LICENSE,
+
+    project_urls={
+
+        "Bug Tracker": f"https://github.com/{AUTHOR_NAME}/{REPO_NAME}/issues",
+
+    },
+    package_dir={"": "src"},
+    packages=setuptools.find_packages(where="src")
+)
src/Bot/__init__.py
ADDED
File without changes
src/Bot/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (175 Bytes).
src/Bot/__pycache__/logger.cpython-310.pyc
ADDED
Binary file (578 Bytes).
src/Bot/exception.py
ADDED
@@ -0,0 +1,21 @@
+import sys
+import os
+from Bot.logger import logger
+
+def error_message_detail(error, error_detail: sys):
+    _, _, exc_tb = error_detail.exc_info()
+    file_name = exc_tb.tb_frame.f_code.co_filename
+    error_message = "Error occurred in python script name [{0}] line number [{1}] error message [{2}]".format(
+        file_name, exc_tb.tb_lineno, str(error))
+
+    return error_message
+
+
+
+class CustomException(Exception):
+    def __init__(self, error_message, error_detail: sys):
+        super().__init__(error_message)
+        self.error_message = error_message_detail(error_message, error_detail=error_detail)
+
+    def __str__(self):
+        return self.error_message
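A brief, hypothetical call site for the CustomException defined above; the class takes the sys module so error_message_detail can read the active traceback via sys.exc_info():

# Hypothetical usage of CustomException (not part of this commit)
import sys
from Bot.exception import CustomException

try:
    result = 1 / 0
except Exception as e:
    # Re-raise with the script name and line number of the original failure
    raise CustomException(e, sys)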
src/Bot/logger.py
ADDED
@@ -0,0 +1,18 @@
+import logging
+import os, sys
+
+logging_str = "[%(asctime)s: %(levelname)s: %(module)s]: %(message)s"
+
+log_dir = "logs"
+
+log_file_path = os.path.join(log_dir, "running_logs.log")
+
+os.makedirs(log_dir, exist_ok=True)
+
+logging.basicConfig(level=logging.INFO, format=logging_str,
+    handlers=[
+        logging.FileHandler(log_file_path),
+        logging.StreamHandler(sys.stdout)  # to display the logger messages in command prompt
+    ])
+
+logger = logging.getLogger("Bot")
src/Bot/utils/__init__.py
ADDED
@@ -0,0 +1,58 @@
+# import ocrmypdf
+# from src.Bot.logger import logging
+
+
+# class OCR:
+#     def __init__(self, input, output):
+#         self.input = input
+#         self.output = output
+
+#     def do_ocr(self):
+#         ocrmypdf.ocr(self.input, output_file=self.output)
+#         return self.output
+
+
+
+import ocrmypdf
+from src.Bot.logger import logging
+import os
+import faiss  # needed by the FAISS helpers below
+import gc
+
+class OCR:
+    def __init__(self, input, output=None):
+        self.input = input
+        # Set default output path if none is provided
+        if output is None:
+            default_output_dir = os.path.join(os.getcwd(), "output")  # Default directory for output files
+            os.makedirs(default_output_dir, exist_ok=True)  # Create the directory if it doesn't exist
+            self.output = os.path.join(default_output_dir, "output.pdf")  # Default output file path
+        else:
+            self.output = output
+
+    def do_ocr(self):
+        ocrmypdf.ocr(self.input, output_file=self.output, force_ocr=True)
+        return self.output
+
+
+# Function to reset the FAISS index (clear vectors)
+def reset_faiss_index(vector_store):
+    """Clear all vectors from the FAISS index."""
+    if isinstance(vector_store.index, faiss.Index):
+        vector_store.index.reset()
+        print("FAISS index has been reset (vectors cleared).")
+    else:
+        print("No FAISS index found.")
+
+# Function to delete the FAISS index (remove from memory)
+def delete_faiss_index(vector_store):
+    """Delete the FAISS index and free up memory."""
+    if isinstance(vector_store.index, faiss.Index):
+        del vector_store.index
+        vector_store.index = None  # Set to None to avoid further access
+        gc.collect()  # Ensure memory is freed
+        print("FAISS index deleted and memory cleared.")
+    else:
+        print("No FAISS index found.")
+
+
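A short usage sketch for the OCR wrapper above (hypothetical snippet; ocrmypdf shells out to the Tesseract and Ghostscript binaries, which must be installed separately):

# Hypothetical standalone use of the OCR wrapper (not part of this commit)
from src.Bot.utils import OCR

ocr = OCR("uploaded_file.pdf")   # output defaults to ./output/output.pdf
searchable_pdf = ocr.do_ocr()    # force_ocr=True runs OCR even on pages that already have a text layer
print(searchable_pdf)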
src/Bot/utils/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (1.56 kB).