File size: 5,006 Bytes
93762d1
 
 
 
 
 
 
28f535f
93762d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28f535f
 
 
 
 
 
93762d1
28f535f
93762d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import gradio as gr
from rag import RAG, ServiceContextModule
from llama_index.core import set_global_service_context
import json
from prompts import general_prompt
from gradio_pdf import PDF
import requests
import os

# Name of the model the active service context was built for; stays None
# until initialize() has created a context.
current_model = None
# The ServiceContextModule instance created by initialize(); None until then.
service_context_module = None


def initialize(api_key, model_name):
    """Validate the Groq API key and (re)build the global service context.

    The key is checked by listing the models available on the Groq
    OpenAI-compatible endpoint. On success the module-level
    ``service_context_module`` is created (or re-created when the selected
    model changed) and registered as the global llama-index service context.

    Args:
        api_key: Groq API key typed by the user. When empty, the
            ``GROQ_API_KEY`` environment variable is used instead.
        model_name: Identifier of the model selected in the dropdown.

    Returns:
        A ``gr.update`` for the model dropdown: the model ids reported by the
        API on success, or an empty choice list when the key could not be
        validated.
    """
    global service_context_module, current_model
    gr.Info("Initializing app")

    # The textbox value only exists at callback time, so the environment
    # fallback has to happen here rather than while the UI is being built.
    api_key = api_key or os.getenv("GROQ_API_KEY")
    if not api_key:
        gr.Warning("Invalid API KEY")
        return gr.update(choices=[])

    url = "https://api.groq.com/openai/v1/models"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    try:
        # A timeout keeps the UI callback from hanging on a stalled connection.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        models = [model["id"] for model in response.json()["data"]]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # gr.Error only shows when raised; gr.Warning displays an alert while
        # still letting us return the dropdown update below.
        gr.Warning("Invalid API KEY")
        return gr.update(choices=[])

    # NOTE(review): the context is rebuilt only when the model changes, not
    # when a different API key is supplied for the same model - confirm
    # whether that is intended.
    if not service_context_module or current_model != model_name:
        service_context_module = ServiceContextModule(api_key, model_name)
        current_model = model_name
        set_global_service_context(
            service_context=service_context_module.service_context
        )
        # Announce only after the context is actually installed.
        gr.Info("App started")
    else:
        gr.Info("App is already running")

    return gr.update(choices=models)


def process_document(file, query):
    """Dispatch an uploaded document to the matching extractor.

    Args:
        file: Path of the uploaded document, or ``None`` when nothing was
            uploaded.
        query: Comma separated list of entities to extract.

    Returns:
        The result of ``process_pdf`` for PDF files, otherwise the string
        ``"Unsupported file format"``.
    """
    # Guard against a missing upload (None) and accept any extension casing
    # such as ".PDF".
    if isinstance(file, str) and file.lower().endswith(".pdf"):
        return process_pdf(file, query=query)
    return "Unsupported file format"


def postprocess_json_string(json_string: str) -> dict:
    """Extract and parse the JSON object embedded in a model response.

    The text between a ``{`` and the final ``}`` is parsed. The span starting
    at the last ``{`` is tried first; if that fails, the span starting at the
    first ``{`` is tried so that nested objects are handled. Each span is
    parsed as-is and, failing that, with single quotes converted to double
    quotes (for Python-dict-style output).

    Args:
        json_string: Raw text that is expected to contain a JSON object.

    Returns:
        The parsed object, or an empty dict when no candidate span parses.
    """
    end = json_string.rfind("}") + 1
    # dict.fromkeys de-duplicates while keeping order when both searches hit
    # the same brace.
    starts = dict.fromkeys((json_string.rfind("{"), json_string.find("{")))
    last_error = None
    for start in starts:
        if start < 0 or end <= start:
            continue
        span = json_string[start:end]
        # Parse untouched text first so apostrophes inside valid JSON values
        # are not corrupted by the quote normalisation.
        for candidate in (span, span.replace("'", '"')):
            try:
                return json.loads(candidate)
            except json.JSONDecodeError as e:
                last_error = e
    print("Error parsing output, invalid json format", last_error)
    return {}


def process_pdf(file, query):
    """Extract the requested fields from a PDF and return them as a dict.

    Builds a ``RAG`` instance over the given file, formats the extraction
    prompt with ``general_prompt`` and parses the query-engine response with
    ``postprocess_json_string``.

    Args:
        file: Path to the PDF document.
        query: Comma separated field names, e.g. ``"Name, Total Salary"``.

    Returns:
        The value returned by ``postprocess_json_string`` for the response.
    """
    rag_module = RAG(filepaths=[file])
    # Trim surrounding whitespace and skip blank entries so that
    # "Name, Total Salary," yields ["Name", "Total Salary"].
    fields = [name.strip() for name in query.split(",") if name.strip()]
    formatted_prompt = general_prompt(fields=fields)
    response = rag_module.run_query_engine(prompt=formatted_prompt)
    return postprocess_json_string(json_string=response)


# Build the UI: an "Init Section" tab that validates the API key and selects
# the model, and an "App Section" tab that runs the extraction on a PDF.
with gr.Blocks(title="Document Information Extractor.") as app:
    gr.Markdown(
        value="""
# Welcome to Document Information Extractor.
Created by [@rajsinghparihar](https://huggingface.co/rajsinghparihar) for extracting useful information from pdf documents like invoices, salary slips, etc.
## Usage:
- In the Init Section, Enter your `GROQ_API_KEY` in the corresponding labeled textbox.
- choose the model from the list of available models.
- click `Initialize` to start the app.

- In the app section, you can upload a document (pdf files: currently works for readable pdfs only, will add ocr functionality later)
- Enter the entities you wanna extract as a comma seperated string. (check the examples for more info)
- Click Submit to see the extracted entities as a JSON object.
"""
    )
    with gr.Tab(label="Init Section") as init_tab:
        with gr.Row():
            api_key = gr.Text(
                label="Enter your Groq API KEY",
                type="password",
            )
            # NOTE: `api_key` is a component object here, not the text the
            # user typed, so it cannot be tested for emptiness while the UI
            # is being built. Rebinding it to a string would also break the
            # `inputs=[api_key, ...]` wiring below. Any fallback to the
            # GROQ_API_KEY environment variable has to be applied inside the
            # click handler, where the submitted value is available.
            available_models = gr.Dropdown(
                value="llama3-70b-8192",
                label="Choose your LLM",
                choices=[
                    "gemma-7b-it",
                    "llama3-70b-8192",
                    "llama3-8b-8192",
                    "mixtral-8x7b-32768",
                    "whisper-large-v3",
                ],
            )
        init_btn = gr.Button(value="Initialize")
        # The handler returns a dropdown update, so the dropdown is both an
        # input (selected model) and the output (refreshed choices).
        init_btn.click(
            fn=initialize,
            inputs=[api_key, available_models],
            outputs=available_models,
        )
    with gr.Tab(label="App Section") as app_tab:
        iface = gr.Interface(
            fn=process_document,
            inputs=[
                PDF(label="Document"),
                gr.Text(
                    label="Entities you wanna extract in comma separated string format"
                ),
            ],
            outputs=gr.JSON(label="Extracted Entities"),
            description="Upload a PDF document and extract specified entities from it.",
            examples=[
                [
                    "examples/Commerce Bank Statement Sample.pdf",
                    "Customer Name, Account Number, Statement Date, Ending Balance, Total Deposits, Checks Paid",
                ],
                [
                    "examples/Salary-Slip-pdf.pdf",
                    "Employee Name, Bank Name, Location, Total Salary, Total Deductions",
                ],
            ],
        )
    gr.Markdown("""
## Pros of LLMs as information extractors over current extraction solutions:
- LLMs are able to understand the scope of the problem from the context and are more robust to typos or extraction failure

## Cons
- Higher Inference Cost
- Can't use free APIs for Sensitive documents.
""")

# Listen on all interfaces on port 7860.
app.launch(server_name="0.0.0.0", server_port=7860)