Alejandro-STC committed on
Commit
8de3fa1
·
verified ·
1 Parent(s): 6f94dc6

Add verification functionality and improve UI

Browse files
Files changed (1) hide show
  1. app.py +98 -33
app.py CHANGED
@@ -25,6 +25,12 @@ import pdfplumber
25
 
26
  pdf_examples_dir = './pdfexamples/'
27
 
 
 
 
 
 
 
28
  model = dspy.OpenAI(
29
  model='gpt-3.5-turbo-0125',
30
  api_key=os.getenv('OPENAI_PROJECT_KEY'),
@@ -147,7 +153,7 @@ def get_PDF_examples(directory):
147
  example_pdf_files = []
148
  for filename in os.listdir(directory):
149
  if filename.endswith('.pdf'):
150
- example_pdf_files.append(os.path.join(directory, filename))
151
  return example_pdf_files
152
 
153
 
@@ -155,7 +161,7 @@ def get_PDF_examples(directory):
155
  class FindInvoiceNumberColumns(dspy.Signature):
156
  """Given an input remittance letter, return a list of column header names that may contain invoice numbers."""
157
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
158
- column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "\
159
  "invoice numbers")
160
 
161
  class InvoiceColumnHeaders(dspy.Module):
@@ -177,7 +183,7 @@ class InvoiceColumnHeaders(dspy.Module):
177
  return Prediction(column_header_names=unique_headers)
178
 
179
  class FindInvoiceList(dspy.Signature):
180
- """Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
181
  """that belong to that column."""
182
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
183
  invoice_column_header = dspy.InputField(desc="invoice column header name")
@@ -214,7 +220,7 @@ class InvoiceList(dspy.Module):
214
  class FindTotalAmountColumns(dspy.Signature):
215
  """Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
216
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
217
- total_column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "\
218
  "the remittance letter total payment amount")
219
 
220
  class TotalAmountColumnHeaders(dspy.Module):
@@ -227,7 +233,7 @@ class TotalAmountColumnHeaders(dspy.Module):
227
  return prediction
228
 
229
  class FindTotalAmount(dspy.Signature):
230
- """Given an input remittance letter and a column header name output the total payment amount """\
231
  """that belongs to that column."""
232
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
233
  total_amount_column_header = dspy.InputField(desc="total amount header name")
@@ -256,13 +262,13 @@ class RemittanceLetterTotalAmount(dspy.Module):
256
  return Prediction(candidate_total_amounts=potential_total_amounts)
257
 
258
 
259
- # Pipeline
260
- def poc_production_pipeline_without_verification(file_content):
261
  # Get invoice candidates
262
  invoice_list_baseline = InvoiceList()
263
  candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
264
 
265
- candidate_invoices = [",".join(lst) for lst in candidate_invoices]
266
 
267
  # Get total amount candidates
268
  total_amount_baseline = RemittanceLetterTotalAmount()
@@ -273,35 +279,94 @@ def poc_production_pipeline_without_verification(file_content):
273
  # Only keep unique amounts
274
  candidate_total_amounts = list(set(candidate_total_amounts))
275
 
276
- # For UI visualisation purposes, create a list of tuples where the second tuple value is empty
277
- candidate_invoices_for_UI = []
278
- candidate_total_amounts_for_UI = []
279
 
 
280
  for candidate in candidate_invoices:
281
- candidate_invoices_for_UI.append((candidate,))
 
282
 
 
 
 
283
  for candidate in candidate_total_amounts:
284
- candidate_total_amounts_for_UI.append((candidate,))
285
-
286
- return candidate_invoices_for_UI, candidate_total_amounts_for_UI
287
-
288
- def poc_production_pipeline_without_verification_from_PDF(file_path):
289
- file_content = extract_text_using_pdfplumber(file_path)
290
- return poc_production_pipeline_without_verification(file_content)
291
-
292
 
293
- # Main app
294
- fake_PDF_examples = get_PDF_examples(pdf_examples_dir)
 
 
 
295
 
296
- remittance_letter_demo_without_verification_from_PDF = gr.Interface(
297
- poc_production_pipeline_without_verification_from_PDF,
298
- [PDF(label="Remittance advice", height=1000)],
299
- [
300
- gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved invoice proposals"], wrap=True),
301
- gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved total amount proposals"], wrap=True)
302
- ],
303
- examples=fake_PDF_examples,
304
- allow_flagging='never'
305
- )
306
 
307
- remittance_letter_demo_without_verification_from_PDF.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  pdf_examples_dir = './pdfexamples/'
27
 
28
+ # model = dspy.LM(
29
+ # model='gpt-3.5-turbo',
30
+ # api_key=os.getenv('OPENAI_PROJECT_KEY'),
31
+ # max_tokens=2000,
32
+ # temperature=0.01)
33
+
34
  model = dspy.OpenAI(
35
  model='gpt-3.5-turbo-0125',
36
  api_key=os.getenv('OPENAI_PROJECT_KEY'),
 
153
  example_pdf_files = []
154
  for filename in os.listdir(directory):
155
  if filename.endswith('.pdf'):
156
+ example_pdf_files.append([os.path.join(directory, filename), '', ''])
157
  return example_pdf_files
158
 
159
 
 
161
  class FindInvoiceNumberColumns(dspy.Signature):
162
  """Given an input remittance letter, return a list of column header names that may contain invoice numbers."""
163
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
164
+ column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "
165
  "invoice numbers")
166
 
167
  class InvoiceColumnHeaders(dspy.Module):
 
183
  return Prediction(column_header_names=unique_headers)
184
 
185
  class FindInvoiceList(dspy.Signature):
186
+ """Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """
187
  """that belong to that column."""
188
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
189
  invoice_column_header = dspy.InputField(desc="invoice column header name")
 
220
  class FindTotalAmountColumns(dspy.Signature):
221
  """Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
222
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
223
+ total_column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "
224
  "the remittance letter total payment amount")
225
 
226
  class TotalAmountColumnHeaders(dspy.Module):
 
233
  return prediction
234
 
235
  class FindTotalAmount(dspy.Signature):
236
+ """Given an input remittance letter and a column header name output the total payment amount """
237
  """that belongs to that column."""
238
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
239
  total_amount_column_header = dspy.InputField(desc="total amount header name")
 
262
  return Prediction(candidate_total_amounts=potential_total_amounts)
263
 
264
 
265
+ # Pipeline with Verification
266
+ def poc_production_pipeline_with_verification(file_content, verification_invoices, verification_total_amount):
267
  # Get invoice candidates
268
  invoice_list_baseline = InvoiceList()
269
  candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
270
 
271
+ candidate_invoices = [','.join(sorted(lst)) for lst in candidate_invoices]
272
 
273
  # Get total amount candidates
274
  total_amount_baseline = RemittanceLetterTotalAmount()
 
279
  # Only keep unique amounts
280
  candidate_total_amounts = list(set(candidate_total_amounts))
281
 
282
+ # Verify invoices
283
+ verification_invoices_list = parse_CSV_string_to_unique(verification_invoices)
284
+ verification_invoices_list_sorted = ','.join(sorted(verification_invoices_list))
285
 
286
+ validated_invoices = []
287
  for candidate in candidate_invoices:
288
+ if candidate == verification_invoices_list_sorted:
289
+ validated_invoices.append(candidate)
290
 
291
+ # Verify total amount
292
+ verification_total_amount_formatted = format_text_decimal(verification_total_amount)
293
+ validated_total_amount = []
294
  for candidate in candidate_total_amounts:
295
+ if candidate == verification_total_amount_formatted:
296
+ validated_total_amount.append(candidate)
 
 
 
 
 
 
297
 
298
+ # Prepare output for UI
299
+ candidate_invoices_for_UI = [(candidate,) for candidate in candidate_invoices]
300
+ candidate_total_amounts_for_UI = [(candidate,) for candidate in candidate_total_amounts]
301
+ validated_invoices_for_UI = [(validated,) for validated in validated_invoices]
302
+ validated_total_amount_for_UI = [(validated,) for validated in validated_total_amount]
303
 
304
+ return candidate_invoices_for_UI, candidate_total_amounts_for_UI, validated_invoices_for_UI, validated_total_amount_for_UI
 
 
 
 
 
 
 
 
 
305
 
306
def poc_production_pipeline_with_verification_from_PDF(file_path, verification_invoices='', verification_total_amount=''):
    """Extract the text of a PDF and run the verification pipeline on it.

    Args:
        file_path: Path of the PDF file to process.
        verification_invoices: Comma-separated invoice numbers to verify the
            retrieved proposals against. Defaults to an empty string so the
            function can be called with the PDF alone (verification data is
            optional in the UI).
        verification_total_amount: Total amount to verify the retrieved
            proposals against. Defaults to an empty string for the same reason.

    Returns:
        The result of ``poc_production_pipeline_with_verification``: candidate
        invoices, candidate total amounts, validated invoices and validated
        total amounts, each as a list of 1-tuples.
    """
    file_content = extract_text_using_pdfplumber(file_path)
    return poc_production_pipeline_with_verification(
        file_content, verification_invoices, verification_total_amount
    )
309
+
310
+
311
+
312
# Main app function
def main():
    """Build the Gradio Blocks UI for the remittance PDF processor and launch it.

    The UI takes a PDF plus two optional verification text inputs, and shows
    four single-column tables: retrieved invoice proposals, retrieved total
    amount proposals, validated invoices and validated total amounts.
    """
    fake_PDF_examples = get_PDF_examples(pdf_examples_dir)

    def _run_example(file_path):
        """Run the pipeline for an example PDF with empty verification inputs."""
        # The examples list only the PDF component as input, so the function
        # handed to gr.Examples must accept exactly one argument.
        return poc_production_pipeline_with_verification_from_PDF(file_path, '', '')

    # NOTE(review): the component nesting below was reconstructed from a diff
    # view that had lost its indentation -- confirm the intended layout.
    with gr.Blocks() as remittance_demo:
        gr.Markdown("# Remittance PDF Processor")
        gr.Markdown("Upload a PDF file to extract invoice numbers and payment amounts. Provide verification data if available for comparison.")

        with gr.Row():
            with gr.Column():
                pdf_input = PDF(label="Remittance advice", height=900)

            with gr.Column():
                with gr.Accordion("Verification Inputs", open=False):
                    verification_invoices = gr.Textbox(label="Verification Invoices (comma-separated)", placeholder="Enter invoice numbers here...")
                    verification_total_amount = gr.Textbox(label="Verification Total Amount", placeholder="Enter total amount here...")

                retrieved_invoices = gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved Invoice Proposals"], wrap=True)
                retrieved_amounts = gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved Total Amount Proposals"], wrap=True)
                validated_invoices = gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Validated Invoices"], wrap=True)
                validated_total_amount = gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Validated Total Amount"], wrap=True)

                submit_button = gr.Button("Process document")

        outputs = [retrieved_invoices, retrieved_amounts, validated_invoices, validated_total_amount]

        submit_button.click(
            poc_production_pipeline_with_verification_from_PDF,
            inputs=[pdf_input, verification_invoices, verification_total_amount],
            outputs=outputs,
        )

        gr.Examples(
            # Each example row is [path, '', '']; keep the path only so that
            # only PDFs are shown in the examples table.
            examples=[[pdf[0]] for pdf in fake_PDF_examples],
            inputs=[pdf_input],
            outputs=outputs,
            fn=_run_example,
            cache_examples=True,
        )

    remittance_demo.launch()


# Run the main app if the file is executed directly
if __name__ == "__main__":
    main()