Upload 13 files
- LICENSE +21 -0
- README.md +0 -12
- app.py +169 -0
- pyproject.toml +12 -0
- requirements.txt +28 -0
- setup.py +34 -0
- src/Bot/__init__.py +0 -0
- src/Bot/__pycache__/__init__.cpython-310.pyc +0 -0
- src/Bot/__pycache__/logger.cpython-310.pyc +0 -0
- src/Bot/exception.py +21 -0
- src/Bot/logger.py +18 -0
- src/Bot/utils/__init__.py +58 -0
- src/Bot/utils/__pycache__/__init__.cpython-310.pyc +0 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Sachin Sen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
CHANGED
@@ -1,12 +0,0 @@
----
-title: Data Extracter
-emoji: 🦀
-colorFrom: pink
-colorTo: blue
-sdk: streamlit
-sdk_version: 1.38.0
-app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,169 @@
+import streamlit as st
+from src.Bot.utils import OCR
+import time
+import os
+import gc
+from langchain_community.docstore.in_memory import InMemoryDocstore
+from langchain_community.vectorstores import FAISS
+import shutil
+from dotenv import load_dotenv
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain.retrievers import SelfQueryRetriever
+from langchain_groq import ChatGroq
+from langchain_core.runnables.history import RunnableWithMessageHistory
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_chroma import Chroma
+from langchain.chains.retrieval import create_retrieval_chain
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+from langchain_community.chat_message_histories import ChatMessageHistory
+from langchain_core.chat_history import BaseChatMessageHistory
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+st.title("Conversational RAG with PDF uploads, OCR, and Chat History")
+st.write("Upload PDFs, perform OCR, and chat with their content")
+
+st.header("Enter API Keys")
+if "groq_api_key" not in st.session_state:
+    st.session_state["groq_api_key"] = None
+if "hf_token" not in st.session_state:
+    st.session_state["hf_token"] = None
+
+if "pdf" not in st.session_state:
+    st.session_state["pdf"] = False
+
+if "chat_button" not in st.session_state:
+    st.session_state["chat_button"] = False
+
+if "default_question" not in st.session_state:
+    st.session_state["default_question"] = False
+
+if "vectorstore" not in st.session_state:
+    st.session_state["vectorstore"] = None
+
+# Input for GROQ API and Hugging Face API
+groq_api_key = st.text_input("Enter your GROQ API Key", type="password")
+hf_token = st.text_input("Enter your Hugging Face API Key", type="password")
+
+if st.button("Submit API Keys"):
+    st.session_state["groq_api_key"] = groq_api_key
+    st.session_state["hf_token"] = hf_token
+    st.success("API keys submitted successfully!")
+
+if st.session_state["groq_api_key"] and st.session_state["hf_token"]:
+    os.environ["GROQ_API_KEY"] = st.session_state["groq_api_key"]
+    os.environ['HF_TOKEN'] = st.session_state["hf_token"]
+
+    llm = ChatGroq(groq_api_key=st.session_state["groq_api_key"], model_name="Gemma2-9b-It")
+    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+
+    st.write("API Keys are set. You can now upload a PDF and start working.")
+
+    file_upload = st.sidebar.file_uploader("Upload your PDF", type="pdf")
+
+    if file_upload:
+        input_pdf_path = os.path.join(os.getcwd(), "uploaded_file.pdf")
+        with open(input_pdf_path, "wb") as f:
+            f.write(file_upload.getvalue())
+
+        ocr_button = st.sidebar.button("OCR")
+        if ocr_button:
+            ocr = OCR(input_pdf_path)
+            output_file_path = ocr.do_ocr()
+
+            st.session_state.pdf = True
+            st.write(output_file_path)
+
+            # Clear existing Chroma DB instance
+            #st.session_state.vectorstore = None
+
+        chat_button = st.sidebar.button("Chat")
+
+        if chat_button:
+            st.session_state.chat_button = True
+
+        clear_history = st.sidebar.button("Clear History")
+
+        if clear_history:
+
+            st.session_state.vectorstore = None  # Ensure Chroma DB is cleared
+            st.session_state["pdf"] = False
+            st.session_state["chat_button"] = False
+            st.session_state["default_question"] = False
+            st.session_state["vectorstore"] = None
+            st.write("History cleared and Chroma DB removed from memory.")
+
+        if st.session_state.pdf and st.session_state.chat_button:
+            default_output_dir = os.path.join(os.getcwd(), "output")
+            os.makedirs(default_output_dir, exist_ok=True)
+            output = os.path.join(default_output_dir, "output.pdf")
+            loader = PyMuPDFLoader(output)
+            documents = loader.load()
+
+            text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
+            splits = text_splitter.split_documents(documents)
+
+            # Create a new Chroma instance
+            st.session_state.vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
+            retriever = st.session_state.vectorstore.as_retriever()
+
+            system_prompt = (
+                """You are an intelligent assistant tasked with extracting specific details from a document. I will list the fields I need information on, and you should provide the answers based on the document content. Please extract and format the answers clearly for each of the following fields:
+
+Please provide the answers in the same order, clearly labeling each field.
+"\n\n"
+"{context}"
+"""
+            )
+
+            prompt = ChatPromptTemplate.from_messages(
+                [
+                    ("system", system_prompt),
+                    ("human", "{input}"),
+                ]
+            )
+
+            question_answer_chain = create_stuff_documents_chain(llm, prompt)
+            rag_chain = create_retrieval_chain(retriever, question_answer_chain)
+
+            if "default_question" not in st.session_state:
+                st.session_state.default_question = False
+
+            default_questions = st.button("Ask Default Questions to PDF")
+
+            question = ["who is second party give its details",
+                        "who is first party", "carpet Area and builtup area", "Rent escalation details",
+                        "Transaction Type (sale/ Lease)", "Registry Date", "Document or registration Number",
+                        "Village name", "Transaction Based on (Builtup or Carpet if both are given consider Builtup)",
+                        "Stamp Duty if Given", "Total Car Parking", "Refund of Interest Fee",
+                        "escalation chart with start date, end date, rent per sqft after escalation percentage by calculation",
+                        "Car parking Charges", "Cam charges per Square feet", "rent per in 1st year",
+                        "rent value", "Lease start date", "Lease End date calculate by start date if not given",
+                        "lock in period in months if given if not give 'NA'", "Notice Period in days or months",
+                        "Location and Floor of area leased to second party", "security Deposit amount"]
+
+            if default_questions:
+                resp = []
+                for i in question:
+                    response = rag_chain.invoke({"input": i})
+                    if 'answer' in response:
+                        resp.append(f"{i} : {response['answer']}")
+                    else:
+                        resp.append(f"{i} : No answer found")
+                    time.sleep(1)
+                st.write("Default questions are selected")
+                st.write(resp)
+
+            user_input = st.text_input("Your question:")
+            if user_input:
+                response = rag_chain.invoke({"input": user_input})
+                st.write("Assistant:", response['answer'])
+
+        else:
+            st.write("<--- Run OCR FIRST")
+else:
+    st.warning("Please enter your API keys to proceed.")
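Note on the chain wiring above: create_retrieval_chain returns a dict that normally carries the retrieved documents under "context" and the model reply under "answer", which is why app.py guards with "if 'answer' in response". A condensed, hypothetical sketch of the same retrieval pipeline outside Streamlit, assuming GROQ_API_KEY is already set in the environment and the OCR step has produced output/output.pdf:

# Hypothetical standalone sketch of the RAG wiring used in app.py (not part of this commit).
import os
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

docs = PyMuPDFLoader(os.path.join("output", "output.pdf")).load()   # OCR'd PDF from the OCR step
splits = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500).split_documents(docs)
vectorstore = Chroma.from_documents(splits, HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"))

prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer strictly from the document.\n\n{context}"),
    ("human", "{input}"),
])
rag_chain = create_retrieval_chain(
    vectorstore.as_retriever(),
    create_stuff_documents_chain(ChatGroq(model_name="Gemma2-9b-It"), prompt),
)
result = rag_chain.invoke({"input": "Who is the second party?"})
print(result.get("answer", "No answer found"))   # same 'answer' key that app.py reads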
pyproject.toml
ADDED
@@ -0,0 +1,12 @@
+[build-system]
+requires = ['setuptools>=42.0', "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.pytest.ini_options]
+testpaths = [
+    "tests"
+]
+
+[tool.mypy]
+mypy_path = "src"
+ignore_missing_imports = true
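The [tool.pytest.ini_options] table above points pytest at a tests/ directory that is not included in this upload. A minimal placeholder test it would discover could look like the following (hypothetical file, assuming the src layout is importable, e.g. after an editable install):

# tests/test_logger.py -- hypothetical example, not part of this commit
from Bot.logger import logger

def test_logger_name():
    # logger.py registers its logger under the name "Bot"
    assert logger.name == "Bot"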
requirements.txt
ADDED
@@ -0,0 +1,28 @@
+ipykernel
+Langchain
+python-dotenv
+langchain-community
+langchain_groq
+#langchain-google
+bs4
+langchain-core
+faiss-cpu
+sentence-transformers
+#PyPDF
+fastapi
+uvicorn
+langserve
+langchain-chroma
+langchain-huggingface
+wikipedia
+arxiv
+duckduckgo-search
+nltk
+unstructured
+youtube-transcript-api
+langserve
+tesseract
+pytesseract
+streamlit
+ocrmypdf
+gs
setup.py
ADDED
@@ -0,0 +1,34 @@
+import setuptools
+
+_version_ = '0.0.1'
+REPO_NAME = 'BOT'
+AUTHOR_NAME = 'Sachinsen1295'
+SOURCE_REPO = "BOT"
+AUTHOR_EMAIL = "[email protected]"
+
+
+with open("README.md", "r", encoding="utf-8") as f:
+    LONG_DESCRIPTION = f.read()
+
+with open("LICENSE", 'r') as L:
+    LICENSE = L.read()
+
+setuptools.setup(
+    name=SOURCE_REPO,
+    version=_version_,
+    author=AUTHOR_NAME,
+    author_email=AUTHOR_EMAIL,
+    description="This is my Deed extraction Bot",
+    long_description=LONG_DESCRIPTION,
+    long_description_content_type="text/markdown",
+    url=f"https://github.com/{AUTHOR_NAME}/{REPO_NAME}",
+    license=LICENSE,
+
+    project_urls={
+
+        "Bug Tracker": f"https://github.com/{AUTHOR_NAME}/{REPO_NAME}/issues",
+
+    },
+    package_dir={"": "src"},
+    packages=setuptools.find_packages(where="src")
+)
src/Bot/__init__.py
ADDED
File without changes
src/Bot/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (175 Bytes).
src/Bot/__pycache__/logger.cpython-310.pyc
ADDED
Binary file (578 Bytes).
src/Bot/exception.py
ADDED
@@ -0,0 +1,21 @@
+import sys
+import os
+from Bot.logger import logger
+
+def error_message_detail(error, error_detail: sys):
+    _, _, exc_tb = error_detail.exc_info()
+    file_name = exc_tb.tb_frame.f_code.co_filename
+    error_message = "Error occurred in python script name [{0}] line number [{1}] error message [{2}]".format(
+        file_name, exc_tb.tb_lineno, str(error))
+
+    return error_message
+
+
+
+class CustomException(Exception):
+    def __init__(self, error_message, error_detail: sys):
+        super().__init__(error_message)
+        self.error_message = error_message_detail(error_message, error_detail=error_detail)
+
+    def __str__(self):
+        return self.error_message
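A brief, hypothetical call site for the CustomException defined above; the class takes the sys module so error_message_detail can read the active traceback via sys.exc_info():

# Hypothetical usage of CustomException (not part of this commit)
import sys
from Bot.exception import CustomException

try:
    result = 1 / 0
except Exception as e:
    # Re-raise with the script name and line number of the original failure
    raise CustomException(e, sys)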
src/Bot/logger.py
ADDED
@@ -0,0 +1,18 @@
+import logging
+import os, sys
+
+logging_str = "[%(asctime)s: %(levelname)s: %(module)s]: %(message)s"
+
+log_dir = "logs"
+
+log_file_path = os.path.join(log_dir, "running_logs.log")
+
+os.makedirs(log_dir, exist_ok=True)
+
+logging.basicConfig(level=logging.INFO, format=logging_str,
+    handlers=[
+        logging.FileHandler(log_file_path),
+        logging.StreamHandler(sys.stdout)  # to display the logger messages in command prompt
+    ])
+
+logger = logging.getLogger("Bot")
src/Bot/utils/__init__.py
ADDED
@@ -0,0 +1,58 @@
+# import ocrmypdf
+# from src.Bot.logger import logging
+
+
+# class OCR:
+#     def __init__(self, input, output):
+#         self.input = input
+#         self.output = output
+
+#     def do_ocr(self):
+#         ocrmypdf.ocr(self.input, output_file=self.output)
+#         return self.output
+
+
+
+import ocrmypdf
+from src.Bot.logger import logging
+import os
+import faiss  # needed by the FAISS helpers below
+import gc
+
+class OCR:
+    def __init__(self, input, output=None):
+        self.input = input
+        # Set default output path if none is provided
+        if output is None:
+            default_output_dir = os.path.join(os.getcwd(), "output")  # Default directory for output files
+            os.makedirs(default_output_dir, exist_ok=True)  # Create the directory if it doesn't exist
+            self.output = os.path.join(default_output_dir, "output.pdf")  # Default output file path
+        else:
+            self.output = output
+
+    def do_ocr(self):
+        ocrmypdf.ocr(self.input, output_file=self.output, force_ocr=True)
+        return self.output
+
+
+# Function to reset the FAISS index (clear vectors)
+def reset_faiss_index(vector_store):
+    """Clear all vectors from the FAISS index."""
+    if isinstance(vector_store.index, faiss.Index):
+        vector_store.index.reset()
+        print("FAISS index has been reset (vectors cleared).")
+    else:
+        print("No FAISS index found.")
+
+# Function to delete the FAISS index (remove from memory)
+def delete_faiss_index(vector_store):
+    """Delete the FAISS index and free up memory."""
+    if isinstance(vector_store.index, faiss.Index):
+        del vector_store.index
+        vector_store.index = None  # Set to None to avoid further access
+        gc.collect()  # Ensure memory is freed
+        print("FAISS index deleted and memory cleared.")
+    else:
+        print("No FAISS index found.")
+
+
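A short usage sketch for the OCR wrapper above (hypothetical snippet; ocrmypdf shells out to the Tesseract and Ghostscript binaries, which must be installed separately):

# Hypothetical standalone use of the OCR wrapper (not part of this commit)
from src.Bot.utils import OCR

ocr = OCR("uploaded_file.pdf")   # output defaults to ./output/output.pdf
searchable_pdf = ocr.do_ocr()    # force_ocr=True runs OCR even on pages that already have a text layer
print(searchable_pdf)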
src/Bot/utils/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (1.56 kB).