sachinsen1295 committed on
Commit
4b0118c
·
verified ·
1 Parent(s): 3cbcfe2

Upload 13 files

Browse files
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Sachin Sen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +0,0 @@
1
- ---
2
- title: Data Extracter
3
- emoji: 🦀
4
- colorFrom: pink
5
- colorTo: blue
6
- sdk: streamlit
7
- sdk_version: 1.38.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from src.Bot.utils import OCR
3
+ import time
4
+ import os
5
+ import gc
6
+ from langchain_community.docstore.in_memory import InMemoryDocstore
7
+ from langchain_community.vectorstores import FAISS
8
+ import shutil
9
+ from dotenv import load_dotenv
10
+ from langchain.chains.combine_documents import create_stuff_documents_chain
11
+ from langchain.retrievers import SelfQueryRetriever
12
+ from langchain_groq import ChatGroq
13
+ from langchain_core.runnables.history import RunnableWithMessageHistory
14
+ from langchain_core.prompts import ChatPromptTemplate
15
+ from langchain_huggingface import HuggingFaceEmbeddings
16
+ from langchain_chroma import Chroma
17
+ from langchain.chains.retrieval import create_retrieval_chain
18
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
19
+ from langchain_community.document_loaders import PyMuPDFLoader
20
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
21
+ from langchain.chains import create_history_aware_retriever, create_retrieval_chain
22
+ from langchain_community.chat_message_histories import ChatMessageHistory
23
+ from langchain_core.chat_history import BaseChatMessageHistory
24
+
25
# Streamlit entry point: collect API keys, OCR an uploaded PDF, index the OCR
# output in Chroma and answer questions about it through a Groq-hosted LLM.
#
# NOTE(review): the block nesting below was reconstructed from a listing whose
# indentation had been flattened — confirm it against the original file.

# Turn off HuggingFace tokenizers parallelism for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

st.title("Conversational RAG with PDF uploads, OCR, and Chat History")
st.write("Upload PDFs, perform OCR, and chat with their content")

st.header("Enter API Keys")

# Seed every session key exactly once so later reads never raise.
for _key, _default in (
    ("groq_api_key", None),
    ("hf_token", None),
    ("pdf", False),
    ("chat_button", False),
    ("default_question", False),
    ("vectorstore", None),
    ("ocr_output_path", None),
):
    if _key not in st.session_state:
        st.session_state[_key] = _default

# Fields extracted from the deed when the user asks for the default report.
DEFAULT_QUESTIONS = [
    "who is second party give its details",
    "who is first party", "carpet Area and builtup area", "Rent escalation details",
    "Transaction Type (sale/ Lease)", "Registry Date", "Document or registration Number",
    "Village name", "Transaction Based on (Builtup or Carpet if both are given consider Builtup)",
    "Stamp Duty if Given", "Total Car Parking", "Refund of Interest Fee",
    "escalation chart with start date, end date, rent per sqft after escalation percentage by calculation",
    "Car parking Charges", "Cam charges per Square feet", "rent per in 1st year",
    "rent value", "Lease start date", "Lease End date calculate by start date if not given",
    "lock in period in months if given if not give 'NA'", "Notice Period in days or months",
    "Location and Floor of area leased to second party", "security Deposit amount",
]

# The original triple-quoted prompt contained literal quote characters around
# "\n\n" and "{context}"; plain adjacent literals give the intended text.
SYSTEM_PROMPT = (
    "You are an intelligent assistant tasked with extracting specific details from a document. "
    "I will list the fields I need information on, and you should provide the answers based on "
    "the document content. Please extract and format the answers clearly for each of the "
    "following fields:\n\n"
    "Please provide the answers in the same order, clearly labeling each field."
    "\n\n"
    "{context}"
)


@st.cache_resource
def _load_embeddings(model_name):
    """Load the sentence-embedding model once per process instead of on every rerun."""
    return HuggingFaceEmbeddings(model_name=model_name)


def _drop_vectorstore():
    """Delete the Chroma collection held in the session (if any) and forget it."""
    store = st.session_state["vectorstore"]
    if store is not None:
        # Setting the session entry to None alone would leave the collection's
        # documents behind — presumably they would then be mixed into the next
        # index built in this process; TODO confirm against the Chroma client used.
        store.delete_collection()
    st.session_state["vectorstore"] = None


# Input for GROQ API and Hugging Face API
groq_api_key = st.text_input("Enter your GROQ API Key", type="password")
hf_token = st.text_input("Enter your Hugging Face API Key", type="password")

if st.button("Submit API Keys"):
    if groq_api_key and hf_token:
        st.session_state["groq_api_key"] = groq_api_key
        st.session_state["hf_token"] = hf_token
        st.success("API keys submitted successfully!")
    else:
        # Do not claim success when a key is missing.
        st.error("Both API keys are required.")

if st.session_state["groq_api_key"] and st.session_state["hf_token"]:
    # NOTE(review): os.environ is process-wide, so on a shared deployment these
    # keys are visible to every session served by this process.
    os.environ["GROQ_API_KEY"] = st.session_state["groq_api_key"]
    os.environ['HF_TOKEN'] = st.session_state["hf_token"]

    llm = ChatGroq(groq_api_key=st.session_state["groq_api_key"], model_name="Gemma2-9b-It")
    embeddings = _load_embeddings("all-MiniLM-L6-v2")

    st.write("API Keys are set. You can now upload a PDF and start working.")

    file_upload = st.sidebar.file_uploader("Upload your PDF", type="pdf")

    if file_upload:
        # Persist the upload so the OCR step can read it from disk.
        input_pdf_path = os.path.join(os.getcwd(), "uploaded_file.pdf")
        with open(input_pdf_path, "wb") as f:
            f.write(file_upload.getvalue())

        if st.sidebar.button("OCR"):
            output_file_path = OCR(input_pdf_path).do_ocr()

            # A fresh OCR run invalidates whatever was indexed before.
            _drop_vectorstore()
            st.session_state["ocr_output_path"] = output_file_path
            st.session_state["pdf"] = True
            st.write(output_file_path)

    if st.sidebar.button("Chat"):
        st.session_state["chat_button"] = True

    if st.sidebar.button("Clear History"):
        _drop_vectorstore()
        st.session_state["pdf"] = False
        st.session_state["chat_button"] = False
        st.session_state["default_question"] = False
        st.session_state["ocr_output_path"] = None
        st.write("History cleared and Chroma DB removed from memory.")

    if st.session_state["pdf"] and st.session_state["chat_button"]:
        # Build the index only once per OCR run; Streamlit re-executes this
        # script on every widget interaction.
        if st.session_state["vectorstore"] is None:
            # Fall back to the OCR helper's default location when no path was recorded.
            ocr_pdf_path = st.session_state["ocr_output_path"] or os.path.join(
                os.getcwd(), "output", "output.pdf"
            )
            documents = PyMuPDFLoader(ocr_pdf_path).load()

            text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
            splits = text_splitter.split_documents(documents)

            st.session_state["vectorstore"] = Chroma.from_documents(
                documents=splits, embedding=embeddings
            )

        retriever = st.session_state["vectorstore"].as_retriever()

        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", SYSTEM_PROMPT),
                ("human", "{input}"),
            ]
        )

        question_answer_chain = create_stuff_documents_chain(llm, prompt)
        rag_chain = create_retrieval_chain(retriever, question_answer_chain)

        if st.button("Ask Default Questions to PDF"):
            resp = []
            for default_question in DEFAULT_QUESTIONS:
                response = rag_chain.invoke({"input": default_question})
                answer = response.get("answer", "No answer found")
                resp.append(f"{default_question} : {answer}")
                # presumably throttles calls to the hosted LLM — confirm the rate limit
                time.sleep(1)
            st.write("Default questions are selected")
            st.write(resp)

        user_input = st.text_input("Your question:")
        if user_input:
            response = rag_chain.invoke({"input": user_input})
            st.write("Assistant:", response.get("answer", "No answer found"))

    else:
        st.write("<--- Run OCR FIRST")
else:
    st.warning("Please enter your API keys to proceed.")
pyproject.toml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ['setuptools>=42.0', "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [tool.pytest.ini_options]
6
+ testpaths = [
7
+ "tests"
8
+ ]
9
+
10
+ [tool.mypy]
11
+ mypy_path = "src"
12
+ ignore_missing_imports = true
requirements.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
ipykernel
Langchain
python-dotenv
langchain-community
langchain_groq
#langchain-google
bs4
langchain-core
faiss-cpu
sentence-transformers
#PyPDF
# PyMuPDF is the backend for langchain's PyMuPDFLoader, which app.py imports.
pymupdf
fastapi
uvicorn
langserve
langchain-chroma
langchain-huggingface
wikipedia
arxiv
duckduckgo-search
nltk
unstructured
youtube-transcript-api
# NOTE(review): OCRmyPDF needs the Tesseract and Ghostscript executables
# installed on the system; confirm that the `tesseract` and `gs` PyPI packages
# listed below are really what was intended.
tesseract
pytesseract
streamlit
ocrmypdf
gs
setup.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import setuptools
2
+
3
# Package metadata for the deed-extraction bot, passed to setuptools.setup() below.
_version_ = '0.0.1'
REPO_NAME = 'BOT'
AUTHOR_NAME = 'Sachinsen1295'
SOURCE_REPO = "BOT"
# NOTE(review): this value looks like an obfuscated placeholder — confirm the real address.
AUTHOR_EMAIL = "[email protected]"


# The README text is passed as the package's long description.
with open("README.md", "r", encoding="utf-8") as f:
    LONG_DESCRIPTION = f.read()

setuptools.setup(
    name=SOURCE_REPO,
    version=_version_,
    author=AUTHOR_NAME,
    author_email=AUTHOR_EMAIL,
    description="This is my Deed extraction Bot",
    long_description=LONG_DESCRIPTION,
    # The keyword is `long_description_content_type`; the earlier spelling
    # (`long_description_content`) is not a recognised setup() option.
    long_description_content_type="text/markdown",
    url=f"https://github.com/{AUTHOR_NAME}/{REPO_NAME}",
    # `license` takes a short identifier, not the full text of the LICENSE file.
    license="MIT",
    project_urls={
        "Bug Tracker": f"https://github.com/{AUTHOR_NAME}/{REPO_NAME}/issues",
    },
    # Packages live under src/ (src-layout).
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src"),
)
src/Bot/__init__.py ADDED
File without changes
src/Bot/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (175 Bytes). View file
 
src/Bot/__pycache__/logger.cpython-310.pyc ADDED
Binary file (578 Bytes). View file
 
src/Bot/exception.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ from Bot.logger import logger
4
+
5
def error_message_detail(error, error_detail: sys):
    """Format ``error`` together with the file and line of the active traceback.

    Args:
        error: The exception (or message) to report; rendered with ``str()``.
        error_detail: Object exposing ``exc_info()``; callers pass the ``sys`` module.

    Returns:
        A single-line message naming the script, the line number and the error text.
        When no exception is currently being handled, the script name and line
        number are reported as ``<unknown>`` instead of raising ``AttributeError``.
    """
    _, _, exc_tb = error_detail.exc_info()
    if exc_tb is None:
        # Called outside an ``except`` block: there is no traceback to inspect.
        file_name = "<unknown>"
        line_number = "<unknown>"
    else:
        file_name = exc_tb.tb_frame.f_code.co_filename
        line_number = exc_tb.tb_lineno

    return "Error occured in python script name [{0}] line number [{1}] error message[{2}]".format(
        file_name, line_number, str(error))
12
+
13
+
14
+
15
class CustomException(Exception):
    """Exception whose string form includes the script name and line number.

    Args:
        error_message: The original exception or message being wrapped.
        error_detail: Object exposing ``exc_info()``. Defaults to the ``sys``
            module, so ``raise CustomException(e)`` works inside an ``except``
            block; passing ``sys`` explicitly behaves exactly as before.
    """

    def __init__(self, error_message, error_detail: sys = sys):
        super().__init__(error_message)
        # Resolve the location now, while the traceback is still the active one.
        self.error_message = error_message_detail(error_message, error_detail=error_detail)

    def __str__(self):
        return self.error_message
src/Bot/logger.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os,sys
3
+
4
# Record layout: timestamp, level and emitting module in brackets, then the message.
logging_str = "[%(asctime)s: %(levelname)s: %(module)s]: %(message)s"

log_dir = "logs"
log_file_path = os.path.join(log_dir, "running_logs.log")

# The directory has to exist before FileHandler opens the log file.
os.makedirs(log_dir, exist_ok=True)

_file_handler = logging.FileHandler(log_file_path)
_console_handler = logging.StreamHandler(sys.stdout)  # also show messages in the command prompt

logging.basicConfig(
    level=logging.INFO,
    format=logging_str,
    handlers=[_file_handler, _console_handler],
)

# Shared logger for the package.
logger = logging.getLogger("Bot")
src/Bot/utils/__init__.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import ocrmypdf
2
+ # from src.Bot.logger import logging
3
+
4
+
5
+ # class OCR:
6
+ # def __init__(self,input, output):
7
+ # self.input = input
8
+ # self.output = output
9
+
10
+ # def do_ocr(self):
11
+ # ocrmypdf.ocr(self.input, output_file=self.output)
12
+ # return self.output
13
+
14
+
15
+
16
+ import ocrmypdf
17
+ from src.Bot.logger import logging
18
+ import os
19
+
20
class OCR:
    """Run OCRmyPDF on one PDF and remember where the result is written.

    Args:
        input: Path of the PDF to process.
        output: Destination path. When omitted, ``output/output.pdf`` under the
            current working directory is used and the ``output`` directory is
            created if it does not exist.
    """

    def __init__(self, input, output=None):
        self.input = input
        if output is not None:
            self.output = output
            return

        # No destination given: fall back to <cwd>/output/output.pdf.
        fallback_dir = os.path.join(os.getcwd(), "output")
        os.makedirs(fallback_dir, exist_ok=True)
        self.output = os.path.join(fallback_dir, "output.pdf")

    def do_ocr(self):
        """OCR ``self.input`` into ``self.output`` (with ``force_ocr=True``) and return the output path."""
        ocrmypdf.ocr(self.input, output_file=self.output, force_ocr=True)
        return self.output
34
+
35
+
36
+ # Function to reset the FAISS index (clear vectors)
37
def reset_faiss_index(vector_store):
    """Clear all vectors from the FAISS index.

    The index is recognised by its callable ``reset`` method rather than by
    ``isinstance(..., faiss.Index)``: ``faiss`` is not imported in the visible
    source of this module, so the original check raised ``NameError``.

    Args:
        vector_store: Object expected to expose the index as ``.index``.
    """
    index = getattr(vector_store, "index", None)
    reset = getattr(index, "reset", None)
    if not callable(reset):
        print("No FAISS index found.")
        return

    reset()
    print("FAISS index has been reset (vectors cleared).")
44
+
45
+ # Function to delete the FAISS index (remove from memory)
46
def delete_faiss_index(vector_store):
    """Delete the FAISS index and free up memory.

    Any non-``None`` ``.index`` is dropped. The original ``isinstance`` check
    against ``faiss.Index`` and the ``gc.collect()`` call both used names that
    are not imported in the visible source of this module, so every call
    raised ``NameError``.

    Args:
        vector_store: Object expected to expose the index as ``.index``.
    """
    import gc  # local import: not available at module level in the visible source

    if getattr(vector_store, "index", None) is None:
        print("No FAISS index found.")
        return

    # Rebind to None rather than deleting the attribute, so later reads of
    # ``vector_store.index`` do not raise AttributeError.
    vector_store.index = None
    gc.collect()  # Ensure memory is freed
    print("FAISS index deleted and memory cleared.")
55
+
56
+
57
+
58
+
src/Bot/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.56 kB). View file