krishnavadithya committed on
Commit
08a82df
·
verified ·
1 Parent(s): cff37c5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -213
app.py CHANGED
@@ -1,222 +1,47 @@
1
- import os
2
- import re
3
- import json
4
- import io
5
- import pandas as pd
6
- from typing import List, Optional
7
- from pydantic import BaseModel,Field
8
- from openai import OpenAI
9
  import gradio as gr
 
10
 
11
- # File processing imports
12
- import PyPDF2
13
- from docx import Document
14
- from openpyxl import load_workbook
15
- from PIL import Image
16
- import pytesseract
17
- from striprtf.striprtf import rtf_to_text
18
 
19
- from azure.core.credentials import AzureKeyCredential
20
- from azure.ai.documentintelligence import DocumentIntelligenceClient
21
- from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat, AnalyzeResult
22
-
23
- endpoint = "https://invoice-processing3141.cognitiveservices.azure.com/"
24
- credential = AzureKeyCredential(os.environ.get("AZURE_APT_KEY"))
25
- document_intelligence_client = DocumentIntelligenceClient(endpoint, credential)
26
-
27
-
28
- # Pydantic Models
29
- class InvoiceItem(BaseModel):
30
- material: str = Field(description="The main product item in the row for which the details are being extracted.")
31
- Packing: str = Field(description="The packing size of the product. This is usually the quantity present in one product")
32
- quantity: str = Field(description="The number of products mentioned in the invoice")
33
-
34
- class Invoice(BaseModel):
35
- po_date: str = Field(description="PO Date is usually the date on which the order was processed.")
36
- order_value: float = Field(description="The total order value present at the end of the invoice product details.")
37
- company_name: str = Field(description="The company sending in the invoice details.")
38
- items: List[InvoiceItem] = Field(description="List of all Invoice Items present in the document.")
39
-
40
- # OpenAI Client Setup
41
- client = OpenAI(
42
- base_url="https://api.studio.nebius.ai/v1/",
43
- api_key=os.environ.get("NEBIUS_API_KEY")
44
- )
45
-
46
- # File Processing Functions
47
- def process_pdf(file_path: str) -> str:
48
- text = ''
49
- with open(file_path, 'rb') as file:
50
- reader = PyPDF2.PdfReader(file)
51
- for page in reader.pages:
52
- text += page.extract_text()
53
-
54
- if not text.strip():
55
- for page in reader.pages:
56
- if '/XObject' in page['/Resources']:
57
- for obj in page['/Resources']['/XObject'].get_object():
58
- if page['/Resources']['/XObject'][obj]['/Subtype'] == '/Image':
59
- xObject = page['/Resources']['/XObject'][obj]
60
- data = xObject.get_data()
61
- image = Image.open(io.BytesIO(data))
62
- text += pytesseract.image_to_string(image)
63
- return text
64
-
65
- def process_doc(file_path: str) -> str:
66
- if file_path.lower().endswith('.doc'):
67
- with open(file_path, "r", encoding='utf-8', errors='ignore') as file:
68
- return rtf_to_text(file.read())
69
- else:
70
- doc = Document(file_path)
71
- return '\n'.join([paragraph.text for paragraph in doc.paragraphs])
72
-
73
- def process_xlsx(file_path: str) -> str:
74
- wb = load_workbook(file_path)
75
- text = ''
76
- for sheet in wb.sheetnames:
77
- ws = wb[sheet]
78
- for row in ws.iter_rows(values_only=True):
79
- text += ' '.join(str(cell) for cell in row if cell is not None) + '\n'
80
- return text
81
-
82
- def process_txt(file_path: str) -> str:
83
- with open(file_path, 'r') as file:
84
- return file.read()
85
-
86
- def process_azure(file_path: str) -> str:
87
- with open(file_path, "rb") as f:
88
- poller = document_intelligence_client.begin_analyze_document("prebuilt-layout", body=f, output_content_format=DocumentContentFormat.MARKDOWN)
89
- result: AnalyzeResult = poller.result()
90
- return result.content
91
-
92
- def process_invoice_txt(file_path: str) -> str:
93
- _, file_extension = os.path.splitext(file_path)
94
-
95
- if file_extension.lower() == '.pdf':
96
- return process_azure(file_path) #process_pdf(file_path)
97
- elif file_extension.lower() in ['.doc']:
98
- return process_doc(file_path)
99
- elif file_extension.lower() in ['.docx']:
100
- return process_azure(file_path)
101
- elif file_extension.lower() in ['.xls', '.xlsx']:
102
- return process_xlsx(file_path)
103
- elif file_extension.lower() == '.txt':
104
- return process_txt(file_path)
105
- else:
106
- raise ValueError(f"Unsupported file format: {file_extension}")
107
-
108
- # Invoice Processing Functions
109
- def process_invoice(invoice_data: str) -> str:
110
- """Process invoice data using OpenAI API with guided JSON."""
111
- system_prompt = """
112
- You are an expert invoice data extraction assistant. Extract key information from invoices
113
- and return it in the specified JSON format. Handle variations in invoice layouts and infer
114
- information when not explicitly stated.
115
- """
116
-
117
- user_prompt = f"""
118
- You are an expert at extracting invoice details from documents. Follow the below instructions carefully and return the items in a structured schema.
119
-
120
- 1. Extract the PO Date from the document. This is usually present before the product details.
121
- 2. Extract the total order value from the document. This would be the total value present at the end of the product details table.
122
- 3. Extract the name of the company sending in the order processing details.
123
- 4. Extract the details of the invoice. The invoice items might vary for each document but look for the main indicators needed for processing.
124
- - Material: This is usually the item name or the product name in the table.
125
- - Packing: This refers to the packing size of the product. The packing size can be in grams(GM), liters(ML) and capsules per pack ('S') for example 15'S is the packing size for a medicinal product.
126
- - Quantity: This is the number of the material/product mentioned in the document.
127
-
128
- Ensure that these instructions are followed and the data is extracted verbatim. Do not add your own comments or make up sentences.
129
-
130
- Invoice:
131
- {invoice_data}
132
- """
133
-
134
- try:
135
- completion = client.chat.completions.create(
136
- model="meta-llama/Meta-Llama-3.1-8B-Instruct-fast",
137
- messages=[
138
- {"role": "system", "content": system_prompt},
139
- {"role": "user", "content": user_prompt}
140
- ],
141
- extra_body={"guided_json": Invoice.model_json_schema()}
142
- )
143
-
144
- if hasattr(completion.choices[0].message, 'refusal') and completion.choices[0].message.refusal:
145
- raise ValueError(f"API refused to process: {completion.choices[0].message.refusal}")
146
-
147
- return completion.choices[0].message.content.strip()
148
- except Exception as e:
149
- raise ValueError(f"Error processing with OpenAI API: {str(e)}")
150
-
151
- def process_and_save_to_excel(file_path: str) -> tuple:
152
- """Process invoice file and save results to Excel."""
153
- if not file_path:
154
- raise ValueError("No file uploaded")
155
-
156
- temp_dir = "temp"
157
- os.makedirs(temp_dir, exist_ok=True)
158
-
159
- try:
160
- # Process directly from Gradio-provided temp file path
161
- invoice_text = process_invoice_txt(file_path)
162
- json_output = process_invoice(invoice_text)
163
- data = Invoice.model_validate_json(json_output)
164
-
165
- # Convert to DataFrame
166
- rows = [{
167
- 'PO Date': data.po_date,
168
- 'Order Value': data.order_value,
169
- 'Company Name': data.company_name,
170
- 'Material': item.material,
171
- 'Packing': item.Packing,
172
- 'Quantity': item.quantity,
173
- } for item in data.items]
174
-
175
- df = pd.DataFrame(rows)
176
-
177
- # Create output filename from the uploaded file's base name (no timestamp is added)
178
- file_name = os.path.basename(file_path).split('.')[0]
179
- output_filename = os.path.join(temp_dir, f"{file_name}.xlsx")
180
-
181
- # Save to Excel
182
- df.to_excel(output_filename, index=False, engine='openpyxl')
183
-
184
- return output_filename, data.model_dump()
185
 
186
- except Exception as e:
187
- raise ValueError(f"Failed to process invoice: {str(e)}")
188
-
189
- # Gradio Interface
190
- def create_gradio_app():
191
- with gr.Blocks() as app:
192
- gr.Markdown("# Invoice Data Processor")
193
-
194
- with gr.Row():
195
- file_input = gr.File(
196
- label="Upload Invoice File (PDF, DOC, XLSX, or TXT)",
197
- file_types=[".pdf", ".doc", ".docx", ".xlsx", ".txt"]
198
- )
199
-
200
- with gr.Row():
201
- process_button = gr.Button("Process Invoice")
202
-
203
- with gr.Row():
204
- excel_output = gr.File(
205
- label="Download Excel File",
206
- file_types=[".xlsx"],
207
- type="binary" # Important for proper file downloading
208
  )
209
- json_output = gr.JSON(label="Extracted Data Preview")
210
 
211
- process_button.click(
212
- fn=process_and_save_to_excel,
213
- inputs=[file_input],
214
- outputs=[excel_output, json_output]
215
- )
216
-
217
- return app
218
 
219
- if __name__ == "__main__":
220
- app = create_gradio_app()
 
 
221
  app.launch(debug=True)
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ from src.invoice_processor import InvoiceProcessor
3
 
4
+ class InvoiceUI:
5
+ def __init__(self):
6
+ self.processor = InvoiceProcessor()
 
 
 
 
7
 
8
+ def create_interface(self) -> gr.Blocks:
9
+ """Create and return the Gradio interface."""
10
+ with gr.Blocks() as app:
11
+ gr.Markdown("# Invoice Data Processor")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
+ with gr.Row():
14
+ file_input = gr.File(
15
+ label="Upload Invoice File (PDF, DOC, XLSX, or TXT)",
16
+ file_types=[".pdf", ".doc", ".docx", ".xlsx", ".txt"]
17
+ )
18
+
19
+ with gr.Row():
20
+ process_button = gr.Button("Process Invoice")
21
+
22
+ with gr.Row():
23
+ excel_output = gr.File(
24
+ label="Download Excel File",
25
+ file_types=[".xlsx"],
26
+ type="binary"
27
+ )
28
+ json_output = gr.JSON(label="Extracted Data Preview")
29
+
30
+ process_button.click(
31
+ fn=self.processor.process_and_save_to_excel,
32
+ inputs=[file_input],
33
+ outputs=[excel_output, json_output]
 
34
  )
 
35
 
36
+ return app
 
 
 
 
 
 
37
 
38
+ def main():
39
+ """Initialize and launch the Gradio interface"""
40
+ ui = InvoiceUI()
41
+ app = ui.create_interface()
42
  app.launch(debug=True)
43
 
44
+ if __name__ == "__main__":
45
+ main()
46
+
47
+