Update app.py
Browse files
app.py
CHANGED
@@ -1,222 +1,47 @@
|
|
1 |
-
import os
|
2 |
-
import re
|
3 |
-
import json
|
4 |
-
import io
|
5 |
-
import pandas as pd
|
6 |
-
from typing import List, Optional
|
7 |
-
from pydantic import BaseModel,Field
|
8 |
-
from openai import OpenAI
|
9 |
import gradio as gr
|
|
|
10 |
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
from openpyxl import load_workbook
|
15 |
-
from PIL import Image
|
16 |
-
import pytesseract
|
17 |
-
from striprtf.striprtf import rtf_to_text
|
18 |
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
# Azure Document Intelligence client shared by the Azure-backed extractor.
# The API key is read from the AZURE_APT_KEY environment variable.
endpoint = "https://invoice-processing3141.cognitiveservices.azure.com/"
credential = AzureKeyCredential(os.getenv("AZURE_APT_KEY"))
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=endpoint,
    credential=credential,
)
|
26 |
-
|
27 |
-
|
28 |
-
# Pydantic Models
|
29 |
-
# One product row of an invoice. All three fields are required strings.
# NOTE: documented with comments rather than a class docstring, because a
# docstring would be copied into the JSON schema generated from this model.
class InvoiceItem(BaseModel):
    # Name of the product/item in the row.
    material: str = Field(
        ...,
        description="The main product item in the row for which the details are being extracted.",
    )
    # Pack size of the product (capitalised name is part of the schema).
    Packing: str = Field(
        ...,
        description="The packing size of the product. This is usually the quantity present in one product",
    )
    # Number of products ordered, kept as text.
    quantity: str = Field(
        ...,
        description="The number of products mentioned in the invoice",
    )
|
33 |
-
|
34 |
-
# Top-level extraction result: header fields plus the list of item rows.
# NOTE: documented with comments rather than a class docstring, because a
# docstring would be copied into the JSON schema generated from this model.
class Invoice(BaseModel):
    # Purchase-order date, kept as the text found in the document.
    po_date: str = Field(
        ...,
        description="PO Date is usually the date on which the order was processed.",
    )
    # Total order value as a number.
    order_value: float = Field(
        ...,
        description="The total order value present at the end of the invoice product details.",
    )
    # Name of the company that sent the invoice.
    company_name: str = Field(
        ...,
        description="The company sending in the invoice details.",
    )
    # Every product row extracted from the document.
    items: List[InvoiceItem] = Field(
        ...,
        description="List of all Invoice Items present in the document.",
    )
|
39 |
-
|
40 |
-
# OpenAI Client Setup
|
41 |
-
# OpenAI-compatible client pointed at the Nebius AI Studio endpoint; the key
# is read from the NEBIUS_API_KEY environment variable.
client = OpenAI(
    api_key=os.getenv("NEBIUS_API_KEY"),
    base_url="https://api.studio.nebius.ai/v1/",
)
|
45 |
-
|
46 |
-
# File Processing Functions
|
47 |
-
def process_pdf(file_path: str) -> str:
    """Extract text from a PDF, falling back to OCR of embedded images.

    The text layer of every page is concatenated first. If that yields only
    whitespace, each image XObject found in the pages' resources is decoded
    with PIL and passed through pytesseract, and the OCR output is appended.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text (possibly empty if nothing could be read).
    """
    # The reader pulls page data from the open stream, so both passes run
    # while the file is still open.
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        # `or ''` guards against extract_text() yielding None for a page.
        text = ''.join(page.extract_text() or '' for page in reader.pages)
        if text.strip():
            return text

        # No usable text layer: OCR every embedded image instead.
        chunks = [text]
        for page in reader.pages:
            # Pages without their own resource dictionary have nothing to OCR.
            if '/Resources' not in page:
                continue
            resources = page['/Resources']
            if '/XObject' not in resources:
                continue
            x_objects = resources['/XObject'].get_object()
            for name in x_objects:
                x_object = x_objects[name]
                if x_object.get('/Subtype') != '/Image':
                    continue
                image = Image.open(io.BytesIO(x_object.get_data()))
                chunks.append(pytesseract.image_to_string(image))
        return ''.join(chunks)
|
64 |
-
|
65 |
-
def process_doc(file_path: str) -> str:
    """Return the text of a Word-style document.

    Paths ending in ``.doc`` (case-insensitive) are read as UTF-8 text with
    undecodable bytes ignored and converted with ``rtf_to_text``. Any other
    path is opened with ``Document`` and its paragraphs are joined with
    newlines.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted plain text.
    """
    is_legacy_doc = file_path.lower().endswith('.doc')
    if not is_legacy_doc:
        document = Document(file_path)
        paragraph_texts = [para.text for para in document.paragraphs]
        return '\n'.join(paragraph_texts)

    with open(file_path, "r", encoding='utf-8', errors='ignore') as handle:
        raw_rtf = handle.read()
    return rtf_to_text(raw_rtf)
|
72 |
-
|
73 |
-
def process_xlsx(file_path: str) -> str:
    """Flatten every sheet of a workbook into newline-separated text.

    Each row becomes one line: its non-``None`` cell values are converted
    with ``str`` and joined by single spaces. Sheets are visited in the
    order given by ``sheetnames``.

    Args:
        file_path: Path of the workbook to load.

    Returns:
        All rows of all sheets, one line per row.
    """
    workbook = load_workbook(file_path)
    lines = []
    for sheet_name in workbook.sheetnames:
        worksheet = workbook[sheet_name]
        lines.extend(
            ' '.join(str(value) for value in row if value is not None) + '\n'
            for row in worksheet.iter_rows(values_only=True)
        )
    return ''.join(lines)
|
81 |
-
|
82 |
-
def process_txt(file_path: str) -> str:
    """Read a plain-text file and return its contents.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding, and undecodable bytes are replaced with U+FFFD instead of
    raising, so an oddly encoded upload still produces text.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents.
    """
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        return file.read()
|
85 |
-
|
86 |
-
def process_azure(file_path: str) -> str:
    """Run the file through the module-level Document Intelligence client.

    The file is submitted to the ``prebuilt-layout`` model with Markdown
    requested as the output content format, and the call blocks until the
    poller returns its result.

    Args:
        file_path: Path of the document to analyze.

    Returns:
        The ``content`` attribute of the analysis result.
    """
    with open(file_path, "rb") as document_stream:
        analysis = document_intelligence_client.begin_analyze_document(
            "prebuilt-layout",
            body=document_stream,
            output_content_format=DocumentContentFormat.MARKDOWN,
        )
        result: AnalyzeResult = analysis.result()
    return result.content
|
91 |
-
|
92 |
-
def process_invoice_txt(file_path: str) -> str:
    """Pick the text extractor that matches the file's extension.

    Routing (extension compared case-insensitively):
      * ``.pdf`` and ``.docx`` go to ``process_azure``
      * ``.doc`` goes to ``process_doc``
      * ``.xls`` and ``.xlsx`` go to ``process_xlsx``
      * ``.txt`` goes to ``process_txt``

    Args:
        file_path: Path of the uploaded invoice file.

    Returns:
        The text produced by the selected extractor.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    file_extension = os.path.splitext(file_path)[1]
    kind = file_extension.lower()

    if kind in ('.pdf', '.docx'):
        return process_azure(file_path)
    if kind == '.doc':
        return process_doc(file_path)
    if kind in ('.xls', '.xlsx'):
        return process_xlsx(file_path)
    if kind == '.txt':
        return process_txt(file_path)

    # The message reports the extension exactly as it appeared in the path.
    raise ValueError(f"Unsupported file format: {file_extension}")
|
107 |
-
|
108 |
-
# Invoice Processing Functions
|
109 |
-
def process_invoice(invoice_data: str) -> str:
    """Process invoice data using OpenAI API with guided JSON.

    Sends a system prompt and a user prompt (with ``invoice_data`` embedded)
    to the module-level ``client`` and constrains the reply to the JSON
    schema generated from ``Invoice``.

    Args:
        invoice_data: Text of the invoice to extract fields from.

    Returns:
        The stripped message content of the first completion choice.

    Raises:
        ValueError: On any failure, including an API refusal or an empty
            reply. The original exception is attached as ``__cause__``.
    """
    system_prompt = """
You are an expert invoice data extraction assistant. Extract key information from invoices
and return it in the specified JSON format. Handle variations in invoice layouts and infer
information when not explicitly stated.
"""

    user_prompt = f"""
You are an expert at extracting invoice details from documents. Follow the below instructions carefully and return the items in a structured schema.

1. Extract the PO Date from the document. This is usually present before the product details.
2. Extract the total order value from the document. This would be the total value present at the end of the product details table.
3. Extract the name of the company sending in the order processing details.
4. Extract the details of the invoice. The invoice items might vary for each document but look for the main indicators needed for processing.
- Material: This is usually the item name or the product name in the table.
- Packing: This refers to the packing size of the product. The packing size can be in grams(GM), liters(ML) and capsules per pack ('S') for example 15'S is the packing size for a medicinal product.
- Quantity: This is the number of the material/product mentioned in the document.

Ensure that these instructions are followed and the data is extracted verbatim. Do not add your own comments or make up sentences.

Invoice:
{invoice_data}
"""

    try:
        completion = client.chat.completions.create(
            model="meta-llama/Meta-Llama-3.1-8B-Instruct-fast",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            extra_body={"guided_json": Invoice.model_json_schema()}
        )

        message = completion.choices[0].message

        # Not every response object carries a `refusal` attribute.
        refusal = getattr(message, 'refusal', None)
        if refusal:
            raise ValueError(f"API refused to process: {refusal}")

        # Report a missing body explicitly instead of failing on .strip().
        content = message.content
        if content is None:
            raise ValueError("API returned an empty response")

        return content.strip()
    except Exception as e:
        # Callers only see ValueError; chaining keeps the root cause visible.
        raise ValueError(f"Error processing with OpenAI API: {str(e)}") from e
|
150 |
-
|
151 |
-
def process_and_save_to_excel(file_path: str) -> tuple:
|
152 |
-
"""Process invoice file and save results to Excel."""
|
153 |
-
if not file_path:
|
154 |
-
raise ValueError("No file uploaded")
|
155 |
-
|
156 |
-
temp_dir = "temp"
|
157 |
-
os.makedirs(temp_dir, exist_ok=True)
|
158 |
-
|
159 |
-
try:
|
160 |
-
# Process directly from Gradio-provided temp file path
|
161 |
-
invoice_text = process_invoice_txt(file_path)
|
162 |
-
json_output = process_invoice(invoice_text)
|
163 |
-
data = Invoice.model_validate_json(json_output)
|
164 |
-
|
165 |
-
# Convert to DataFrame
|
166 |
-
rows = [{
|
167 |
-
'PO Date': data.po_date,
|
168 |
-
'Order Value': data.order_value,
|
169 |
-
'Company Name': data.company_name,
|
170 |
-
'Material': item.material,
|
171 |
-
'Packing': item.Packing,
|
172 |
-
'Quantity': item.quantity,
|
173 |
-
} for item in data.items]
|
174 |
-
|
175 |
-
df = pd.DataFrame(rows)
|
176 |
-
|
177 |
-
# Create output filename from the uploaded file's base name
|
178 |
-
file_name = os.path.basename(file_path).split('.')[0]
|
179 |
-
output_filename = os.path.join(temp_dir, f"{file_name}.xlsx")
|
180 |
-
|
181 |
-
# Save to Excel
|
182 |
-
df.to_excel(output_filename, index=False, engine='openpyxl')
|
183 |
-
|
184 |
-
return output_filename, data.model_dump()
|
185 |
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
type="binary" # Important for proper file downloading
|
208 |
)
|
209 |
-
json_output = gr.JSON(label="Extracted Data Preview")
|
210 |
|
211 |
-
|
212 |
-
fn=process_and_save_to_excel,
|
213 |
-
inputs=[file_input],
|
214 |
-
outputs=[excel_output, json_output]
|
215 |
-
)
|
216 |
-
|
217 |
-
return app
|
218 |
|
219 |
-
|
220 |
-
|
|
|
|
|
221 |
app.launch(debug=True)
|
222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
from src.invoice_processor import InvoiceProcessor
|
3 |
|
4 |
+
class InvoiceUI:
    """Gradio front end that hands uploaded invoices to an InvoiceProcessor."""

    def __init__(self):
        # Processor whose process_and_save_to_excel method backs the button.
        self.processor = InvoiceProcessor()

    def create_interface(self) -> gr.Blocks:
        """Build the Blocks layout and wire the process button.

        The page has a title, a file upload, a "Process Invoice" button, and
        an output row with a downloadable Excel file and a JSON preview.
        Clicking the button calls the processor with the uploaded file and
        routes its two return values to the two outputs.

        Returns:
            The assembled ``gr.Blocks`` app (not yet launched).
        """
        app = gr.Blocks()
        with app:
            gr.Markdown("# Invoice Data Processor")

            with gr.Row():
                upload = gr.File(
                    file_types=[".pdf", ".doc", ".docx", ".xlsx", ".txt"],
                    label="Upload Invoice File (PDF, DOC, XLSX, or TXT)",
                )

            with gr.Row():
                run_button = gr.Button("Process Invoice")

            with gr.Row():
                download = gr.File(
                    type="binary",
                    file_types=[".xlsx"],
                    label="Download Excel File",
                )
                preview = gr.JSON(label="Extracted Data Preview")

            handler = self.processor.process_and_save_to_excel
            run_button.click(
                fn=handler,
                inputs=[upload],
                outputs=[download, preview],
            )

        return app
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
+
def main():
    """Build the invoice UI and start the Gradio server in debug mode."""
    interface = InvoiceUI().create_interface()
    interface.launch(debug=True)


# Only start the server when this file is run as a script.
if __name__ == "__main__":
    main()
|
46 |
+
|
47 |
+
|