Spaces:

rajsinghparihar
/

Document-Information-Extraction

Running

App Files Files Community

rajsinghparihar committed on Jun 23, 2024

Commit

93762d1

1 Parent(s): 862ba62

first commit: doc-info-ext v0.0.1

Browse files

Files changed (8) hide show

.gitignore +168 -0
README.md +6 -6
app.py +136 -0
examples/Commerce Bank Statement Sample.pdf +0 -0
examples/Salary-Slip-pdf.pdf +0 -0
prompts.py +17 -0
rag.py +57 -0
requirements.txt +7 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,168 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+*.zip
+*.xlsx
+*.png
+*.ipynb
+*.DS_Store
+*.db
+*.tar

README.md CHANGED Viewed

@@ -1,12 +1,12 @@
 ---
-title: Document Information Extraction
 emoji: 🔥
-colorFrom: gray
-colorTo: blue
-sdk: streamlit
-sdk_version: 1.35.0
 app_file: app.py
-pinned: false
 license: apache-2.0
 ---

 ---
+title: Document Information Extractor
 emoji: 🔥
+colorFrom: purple
+colorTo: purple
+sdk: gradio
+sdk_version: 3.3.1
 app_file: app.py
+pinned: true
 license: apache-2.0
 ---

app.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import gradio as gr
+from rag import RAG, ServiceContextModule
+from llama_index.core import set_global_service_context
+from dotenv import load_dotenv
+import json
+from prompts import general_prompt
+from gradio_pdf import PDF
+import requests
+# Module-level state shared across calls to initialize(): the currently built
+# ServiceContextModule (None until the first successful initialization) and the
+# model name it was built for, used to decide whether a rebuild is needed.
+service_context_module = None
+current_model = None
+def initialize(api_key, model_name):
+    global service_context_module, current_model
+    gr.Info("Initializing app")
+    load_dotenv(override=True)
+    url = "https://api.groq.com/openai/v1/models"
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+    }
+    try:
+        response = requests.get(url, headers=headers)
+        data = response.json()
+        models = [model["id"] for model in data["data"]]
+    except Exception:
+        gr.Error("Invalid API KEY")
+        return gr.update(choices=[])
+    if not service_context_module or current_model != model_name:
+        service_context_module = ServiceContextModule(api_key, model_name)
+        current_model = model_name
+        gr.Info("App started")
+        set_global_service_context(
+            service_context=service_context_module.service_context
+        )
+    else:
+        gr.Info("App is already running")
+    return gr.update(choices=models)
+def process_document(file, query):
+    if file.endswith(".pdf"):
+        return process_pdf(file, query=query)
+    else:
+        return "Unsupported file format"
+def postprocess_json_string(json_string: str) -> dict:
+    json_string = json_string.replace("'", '"')
+    json_string = json_string[json_string.rfind("{") : json_string.rfind("}") + 1]
+    try:
+        json_data = json.loads(json_string)
+    except Exception as e:
+        print("Error parsing output, invalid json format", e)
+    return json_data
+def process_pdf(file, query):
+    rag_module = RAG(filepaths=[file])
+    fields = [field for field in query.split(",")]
+    formatted_prompt = general_prompt(fields=fields)
+    response = rag_module.run_query_engine(prompt=formatted_prompt)
+    extracted_json = postprocess_json_string(json_string=response)
+    return extracted_json
+# Top-level UI definition: a Blocks app with an intro, an "Init Section" tab
+# wired to initialize(), and an "App Section" tab wired to process_document().
+with gr.Blocks(title="Document Information Extractor.") as app:
+    gr.Markdown(
+        value="""
+# Welcome to Document Information Extractor.
+Created by [@rajsinghparihar](https://huggingface.co/rajsinghparihar) for extracting useful information from pdf documents like invoices, salary slips, etc.
+## Usage:
+- In the Init Section, Enter your `GROQ_API_KEY` in the corresponding labeled textbox.
+- choose the model from the list of available models.
+- click `Initialize` to start the app.
+- In the app section, you can upload a document (pdf files: currently works for readable pdfs only, will add ocr functionality later)
+- Enter the entities you wanna extract as a comma seperated string. (check the examples for more info)
+- Click Submit to see the extracted entities as a JSON object.
+"""
+    )
+    # Init tab: collects the API key and model choice, then calls initialize().
+    with gr.Tab(label="Init Section") as init_tab:
+        with gr.Row():
+            api_key = gr.Text(label="Enter your Groq API KEY", type="password")
+            # NOTE(review): "whisper-large-v3" looks like a speech model rather
+            # than a chat LLM -- confirm it belongs in this list.
+            available_models = gr.Dropdown(
+                label="Choose your LLM",
+                choices=[
+                    "gemma-7b-it",
+                    "llama3-70b-8192",
+                    "llama3-8b-8192",
+                    "mixtral-8x7b-32768",
+                    "whisper-large-v3",
+                ],
+            )
+        init_btn = gr.Button(value="Initialize")
+        # The dropdown is both an input (selected model) and the output, so the
+        # value returned by initialize() replaces its list of choices.
+        init_btn.click(
+            fn=initialize,
+            inputs=[api_key, available_models],
+            outputs=available_models,
+        )
+    # App tab: PDF upload plus a comma separated entity list, rendered as JSON.
+    with gr.Tab(label="App Section") as app_tab:
+        iface = gr.Interface(
+            fn=process_document,
+            inputs=[
+                PDF(label="Document"),
+                gr.Text(
+                    label="Entities you wanna extract in comma separated string format"
+                ),
+            ],
+            outputs=gr.JSON(label="Extracted Entities"),
+            description="Upload a PDF document and extract specified entities from it.",
+            examples=[
+                [
+                    "examples/Commerce Bank Statement Sample.pdf",
+                    "Customer Name, Account Number, Statement Date, Ending Balance, Total Deposits, Checks Paid",
+                ],
+                [
+                    "examples/Salary-Slip-pdf.pdf",
+                    "Employee Name, Bank Name, Location, Total Salary, Total Deductions",
+                ],
+            ],
+        )
+    gr.Markdown("""
+## Pros of LLMs as information extractors over current extraction solutions:
+- LLMs are able to understand the scope of the problem from the context and are more robust to typos or extraction failure
+## Cons
+- Higher Inference Cost
+- Can't use free APIs for Sensitive documents.
+""")
+# Start the server on all interfaces at port 7860 when this module is executed.
+app.launch(server_name="0.0.0.0", server_port=7860)

examples/Commerce Bank Statement Sample.pdf ADDED Viewed

Binary file (55.1 kB). View file

examples/Salary-Slip-pdf.pdf ADDED Viewed

Binary file (38.3 kB). View file

prompts.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import outlines
+@outlines.prompt
+def general_prompt(fields):
+    """
+    You are an entity extractor.
+    Using the information in the provided documents, use your deep understanding of documents and complete the following tasks.
+    1. Answer the question, What are the values of the following, {{ fields }}?
+    2. Print the answers against each field in a step by step approach.
+    3. After you have all the answers ready, Please format the response in JSON format, with these fields as keys and their answers as values.
+    Make sure to follow the Instructions below.
+    1. In the records, make sure to only include the values of the descriptors without any descriptor names.
+    2. Do NOT Create a Nested JSON response. If response is Nested, format it to a simpler JSON format.
+    2. Avoid keywords like <<SYS>> or [SYS] or [INST] in the final response.
+    """

rag.py ADDED Viewed

	@@ -0,0 +1,57 @@

+from llama_index.core import (
+    VectorStoreIndex,
+    SimpleDirectoryReader,
+    get_response_synthesizer,
+    ServiceContext,
+)
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core.postprocessor import SentenceTransformerRerank
+from typing import Optional, List
+from llama_index.llms.groq import Groq
+class RAG:
+    def __init__(
+        self, filepaths: List[str], rerank: Optional[SentenceTransformerRerank] = None
+    ) -> None:
+        documents = SimpleDirectoryReader(input_files=filepaths).load_data()
+        response_synthesizer = get_response_synthesizer(
+            response_mode="tree_summarize",
+            use_async=True,
+        )
+        self.index = VectorStoreIndex.from_documents(
+            documents=documents,
+            response_synthesizer=response_synthesizer,
+        )
+        if not rerank:
+            self.query_engine = self.index.as_query_engine(
+                response_mode="tree_summarize",
+                use_async=True,
+                streaming=True,
+                similarity_top_k=10,
+            )
+        else:
+            self.query_engine = self.index.as_query_engine(
+                response_mode="tree_summarize",
+                use_async=True,
+                streaming=True,
+                similarity_top_k=10,
+                node_postprocessors=[rerank],
+            )
+    def run_query_engine(self, prompt):
+        response = self.query_engine.query(prompt)
+        response.print_response_stream()
+        return str(response)
+class ServiceContextModule:
+    def __init__(self, api_key, model_name) -> None:
+        self._llm = Groq(model=model_name, api_key=api_key)
+        self._embedding_model = HuggingFaceEmbedding(
+            "Snowflake/snowflake-arctic-embed-m-long", trust_remote_code=True
+        )
+        self.service_context = ServiceContext.from_defaults(
+            llm=self._llm,
+            embed_model=self._embedding_model,
+        )

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio
+llama-index
+llama-index-llms-groq
+llama-index-embeddings-huggingface
+einops
+outlines
+gradio_pdf