File size: 11,306 Bytes
275359b
 
 
 
 
 
 
 
 
 
 
7698d94
275359b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7698d94
275359b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9aedcaa
275359b
9aedcaa
 
275359b
 
 
 
 
c535f48
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
import os

#DSPY
import dspy
from dspy import Prediction
from dspy.evaluate import Evaluate
from dspy import Prediction
from dspy.teleprompt import BootstrapFewShot
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# Data handling
# import pandas as pd

# Calculations and formatting
import re
from decimal import Decimal

# UI
import gradio as gr
from gradio_pdf import PDF

# PDF handling
import pdfplumber


# Folder that is scanned (further down in this file) for the example PDFs
# offered in the UI.
pdf_examples_dir = './pdfexamples/'

# OpenAI client handed to DSPy below. The API key is read from the
# OPENAI_PROJECT_KEY environment variable; os.getenv returns None if unset.
model = dspy.OpenAI(
    model='gpt-3.5-turbo-0125',
    api_key=os.getenv('OPENAI_PROJECT_KEY'),
    max_tokens=2000,
    temperature=0.01)

# Register the client as the language model in the global DSPy settings.
dspy.settings.configure(lm=model)


# Utils
def parse_CSV_string(csv_string):
  """Split a comma-separated string into a list of unique, normalised items.

  Every item is stripped of surrounding whitespace and lower-cased. Empty
  items (from an empty input, a doubled comma or a trailing comma) are
  dropped, and the remaining items keep the order of their first occurrence
  so the result is the same on every run.

  Args:
    csv_string: Text such as "Invoice No, Reference, invoice no".

  Returns:
    List of unique, non-empty, lower-cased items in first-seen order.
  """
  normalised_items = (item.strip().lower() for item in csv_string.split(','))
  # dict.fromkeys de-duplicates while keeping insertion order (a set does not)
  return list(dict.fromkeys(item for item in normalised_items if item))

def parse_list_of_CSV_strings(list_of_csv_strings):
  """Parse each CSV string with parse_CSV_string and return the list of results.

  The output has one parsed list per input string, in input order.
  """
  return [parse_CSV_string(csv_string) for csv_string in list_of_csv_strings]

def parse_invoice_number(s):
  """Return the leading invoice-number token of `s`, or `s` itself if absent.

  A token is recognised only at the very start of the string (optional
  leading whitespace is skipped): one or more non-space characters followed
  by six digits. The original author describes this as Siemens' format.
  """
  match = re.search(r'^\s*?([\S\d]+\d{6})', s)
  if match is None:
    return s
  return match.group(1)

def standardize_number(s):
    """Normalise the decimal and grouping separators of a textual number.

    The last comma or period that lies before the last digit is taken to be
    the decimal separator and is rewritten as a period; every comma, period
    and whitespace character in front of it is removed. Text after the last
    digit (a currency code, a sentence-ending period, ...) is never used as
    the separator, so "1,234.56." gives "1234.56." rather than "123456.".

    NOTE: a lone grouping separator is still read as a decimal separator,
    e.g. "1,000" becomes "1.000".

    Args:
        s: Text containing a number, e.g. "1.234,56" or "1 000,50 eur".

    Returns:
        The text with a period as decimal separator. When no separator
        precedes the last digit, `s` with all commas, periods and whitespace
        removed.
    """
    # Find the last digit: separators behind it cannot be decimal separators.
    last_digit = re.search(r'\d(?=\D*$)', s)
    number_end = last_digit.end() if last_digit else len(s)
    number_part, trailing_text = s[:number_end], s[number_end:]

    last_separator_index = max(number_part.rfind(','), number_part.rfind('.'))
    if last_separator_index == -1:
        # No decimal separator: just remove commas, periods and whitespace
        return re.sub(r'[.,\s]', '', s)

    before_separator = number_part[:last_separator_index]
    after_separator = number_part[last_separator_index + 1:]

    # Everything in front of the decimal separator is grouping noise
    before_separator_cleaned = re.sub(r'[.,\s]', '', before_separator)

    # Always emit a period as decimal separator; trailing text is kept as is
    return before_separator_cleaned + '.' + after_separator + trailing_text

def remove_chars_after_last_digit(s):
    """Strip the run of non-digit characters that follows the last digit.

    A string that contains no digit is returned unchanged.
    """
    trailing_non_digits = re.compile(r'(?<=\d)[^\d]*$')
    return trailing_non_digits.sub('', s)

def clean_text(s):
    """Drop the non-numeric text that precedes a digit-bearing token.

    Each regex match is made of a run of characters that are neither digits
    nor '-', an optional whitespace character, and then a whitespace-free
    token holding at least one digit. The whole match is replaced by that
    token alone. Text in which nothing matches is returned unchanged.
    """
    noise_then_token = r'[^\d-]*\s?(\S*\d\S*)'
    return re.sub(noise_then_token, r'\1', s)

def format_text_decimal(text_decimal):
  """Normalise a textual amount by chaining the formatting helpers.

  Steps, in order: strip and lower-case the input, standardize_number,
  remove_chars_after_last_digit, clean_text.
  """
  normalised = text_decimal.strip().lower()
  normalised = standardize_number(normalised)
  normalised = remove_chars_after_last_digit(normalised)
  return clean_text(normalised)


# PDF handling
def extract_text_using_pdfplumber(file_path):
  """Extract the text of every page of a PDF and return it as one string.

  Duplicated overlapping characters are removed from each page before its
  text is extracted. Page texts are joined with a newline so the last line
  of one page does not run into the first line of the next, and a page for
  which no text is returned (None) contributes an empty string instead of
  raising a TypeError.

  Args:
    file_path: Path of the PDF file to read.

  Returns:
    The page texts joined by newlines.
  """
  # TODO: add check for text vs images pdf
  with pdfplumber.open(file_path) as pdf:
    page_texts = [
      # Remove duplicate characters from the page, then read its text
      page.dedupe_chars(tolerance=1).extract_text() or ''
      for page in pdf.pages
    ]
  return '\n'.join(page_texts)

def get_PDF_examples(directory):
  """List the PDF files found directly inside `directory`.

  Args:
    directory: Folder to scan (sub-folders are not searched).

  Returns:
    Paths (`directory` joined with the file name) of the entries whose name
    ends in ".pdf" in any letter case, sorted by file name so the order does
    not depend on the file system. An empty list when `directory` does not
    exist, so a missing examples folder does not abort start-up.
  """
  if not os.path.isdir(directory):
    return []
  return [
    os.path.join(directory, filename)
    for filename in sorted(os.listdir(directory))
    if filename.lower().endswith('.pdf')
  ]


# Signatures and Models
class FindInvoiceNumberColumns(dspy.Signature):
  # Signature of the first invoice step: remittance letter text in, candidate
  # invoice-number column headers out (as one comma-separated string).
  # NOTE: DSPy builds the prompt from the docstring below and from the `desc`
  # texts, so they are runtime text - changing them changes model behaviour.
  """Given an input remittance letter, return a list of column header names that may contain invoice numbers."""
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
  column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "\
                                                "invoice numbers")

class InvoiceColumnHeaders(dspy.Module):
  # Wraps a single predictor that proposes which column headers of a
  # remittance letter may hold invoice numbers.
  def __init__(self):
    super().__init__()
    # Predict is used here instead of ChainOfThought (Ervin suggests Predict)
    self.potential_invoice_column_headers = dspy.Predict(FindInvoiceNumberColumns)

  def forward(self, file_content):
    """Return the predictor's Prediction for the letter text `file_content`.

    The comma-separated header names are in its `column_header_names` field.
    """
    # NOTE: a plain (or already parsed) list could be returned instead, for
    # consistency with the other modules. Building a new Prediction that also
    # carries extra data (e.g. the file content) could help a verification step.
    return self.potential_invoice_column_headers(content=file_content)

class FindInvoiceList(dspy.Signature):
  # Signature of the second invoice step: letter text plus one column header
  # in, the invoice numbers of that column out (as one comma-separated string).
  # NOTE: DSPy builds the prompt from the docstring below and from the `desc`
  # texts, so they are runtime text - changing them changes model behaviour.
  """Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
  """that belong to that column."""
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
  invoice_column_header = dspy.InputField(desc="invoice column header name")
  candidate_invoice_numbers = dspy.OutputField(desc="comma-separated list of invoice numbers")

class InvoiceList(dspy.Module):
  # Two-step invoice extraction: predict the candidate column headers first,
  # then ask for the invoice numbers listed under each of those headers.
  def __init__(self):
    super().__init__()
    self.find_invoice_headers = InvoiceColumnHeaders() # here we could load a compiled program also
    self.find_invoice_numbers = dspy.Predict(FindInvoiceList)

  def forward(self, file_content):
    """Return a Prediction holding the candidate invoice-number lists.

    Args:
      file_content: Text of the remittance letter.

    Returns:
      Prediction whose `candidate_invoice_numbers` is a list of lists: one
      parsed list of invoice numbers per candidate column header, where
      lists that contain the same invoice numbers are reported only once.
    """
    # Predict column headers (returns a Prediction with a CSV string in "column_header_names")
    predict_column_headers = self.find_invoice_headers(file_content=file_content)
    # Parse CSV into a list
    potential_invoice_column_headers = parse_CSV_string(predict_column_headers.column_header_names)

    # One CSV string of invoice numbers per candidate header
    raw_invoice_lists = [
      self.find_invoice_numbers(content=file_content, invoice_column_header=header).candidate_invoice_numbers
      for header in potential_invoice_column_headers
    ]

    # Different headers can yield the same invoice numbers; keep the first
    # occurrence only. Lists are compared as frozensets because the order of
    # the items inside a parsed list carries no meaning.
    unique_invoice_lists = []
    seen_invoice_sets = set()
    for invoice_list in parse_list_of_CSV_strings(raw_invoice_lists):
      invoice_set = frozenset(invoice_list)
      if invoice_set not in seen_invoice_sets:
        seen_invoice_sets.add(invoice_set)
        unique_invoice_lists.append(invoice_list)

    # We need to return a Prediction for the Evaluate function later on
    return Prediction(candidate_invoice_numbers=unique_invoice_lists)

class FindTotalAmountColumns(dspy.Signature):
  # Signature of the first total-amount step: remittance letter text in,
  # candidate total-amount column headers out (as one comma-separated string).
  # NOTE: DSPy builds the prompt from the docstring below and from the `desc`
  # texts, so they are runtime text - changing them changes model behaviour.
  """Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
  total_column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "\
                                                "the remittance letter total payment amount")

class TotalAmountColumnHeaders(dspy.Module):
  # Wraps a single predictor that proposes which column headers of a
  # remittance letter may hold the total payment amount.
  def __init__(self):
    super().__init__()
    self.potential_total_amount_column_headers = dspy.Predict(FindTotalAmountColumns)

  def forward(self, file_content):
    """Return the predictor's Prediction for the letter text `file_content`.

    The comma-separated header names are in its `total_column_header_names`
    field.
    """
    return self.potential_total_amount_column_headers(content=file_content)

class FindTotalAmount(dspy.Signature):
  # Signature of the second total-amount step: letter text plus one column
  # header in, the total payment amount found for that header out.
  # NOTE: DSPy builds the prompt from the docstring below and from the `desc`
  # texts, so they are runtime text - changing them changes model behaviour.
  """Given an input remittance letter and a column header name output the total payment amount """\
  """that belongs to that column."""
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
  total_amount_column_header = dspy.InputField(desc="total amount header name")
  total_amount = dspy.OutputField(desc="total payment amount")

class RemittanceLetterTotalAmount(dspy.Module):
  # Two-step total-amount extraction: predict the candidate column headers
  # first, then ask for the total payment amount under each of those headers.
  def __init__(self):
    super().__init__()
    # The invoice list is deliberately not predicted in this module; the
    # pipeline function calls the two modules separately instead. If it were
    # done here, forward would have to return both the potential total
    # amounts and the potential invoice lists.
    self.find_total_amount_header = TotalAmountColumnHeaders()
    self.find_total_amount = dspy.Predict(FindTotalAmount)

  def forward(self, file_content):
    """Return a Prediction with the distinct candidate total amounts.

    The amounts are the raw `total_amount` strings of the per-header
    predictions, available as a list in `candidate_total_amounts`.
    """
    # The header prediction carries a CSV string in "total_column_header_names"
    headers_prediction = self.find_total_amount_header(file_content=file_content)
    candidate_headers = parse_CSV_string(headers_prediction.total_column_header_names)

    # One prediction per candidate header
    amounts = [
      self.find_total_amount(content=file_content, total_amount_column_header=header).total_amount
      for header in candidate_headers
    ]

    # Duplicates are removed through a set. Returning the last prediction
    # itself would also work, but references to candidate_total_amounts
    # would then have to change.
    return Prediction(candidate_total_amounts=list(set(amounts)))


# Pipeline
def poc_production_pipeline_without_verification(file_content):
  """Extract invoice-list and total-amount candidates from a remittance letter.

  Args:
    file_content: Text of the remittance letter.

  Returns:
    A pair (invoice_rows, total_amount_rows). Each is a list of one-element
    tuples, i.e. one single-cell row per candidate for the UI tables.
    Invoice candidates are lists of invoice numbers; total amounts are
    formatted with format_text_decimal and then de-duplicated, so values
    that only differed in formatting are shown once.
  """
  # TODO: place this in a module - init allows to pass a compiled module and forward handles the data:
  # so we can evaluate the pipeline (check if any tuple matches the verifier)

  # Get invoice candidates
  invoice_list_baseline = InvoiceList()
  candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers

  # Get total amount candidates
  total_amount_baseline = RemittanceLetterTotalAmount()
  raw_total_amounts = total_amount_baseline(file_content=file_content).candidate_total_amounts

  # Format all decimals first and de-duplicate afterwards: distinct raw
  # strings can format to the same amount. dict.fromkeys keeps first-seen order.
  candidate_total_amounts = list(dict.fromkeys(map(format_text_decimal, raw_total_amounts)))

  # For UI visualisation purposes, wrap every candidate in a one-element tuple
  candidate_invoices_for_UI = [(candidate,) for candidate in candidate_invoices]
  candidate_total_amounts_for_UI = [(candidate,) for candidate in candidate_total_amounts]

  return candidate_invoices_for_UI, candidate_total_amounts_for_UI

def poc_production_pipeline_without_verification_from_PDF(file_path):
  """Run the pipeline on the text extracted from the PDF at `file_path`.

  Returns whatever poc_production_pipeline_without_verification returns for
  that text (not converted to a string).
  """
  return poc_production_pipeline_without_verification(
    extract_text_using_pdfplumber(file_path))


# Main app
# Example PDFs found in pdf_examples_dir; listed once, when this module runs.
fake_PDF_examples = get_PDF_examples(pdf_examples_dir)

# UI definition: one PDF input, two single-column tables as outputs. The
# pipeline function returns a pair of row lists, one per table, in this order.
remittance_letter_demo_without_verification_from_PDF = gr.Interface(
  poc_production_pipeline_without_verification_from_PDF,
  [PDF(label="Remittance advice", height=1000)],
  [
    gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved invoice proposals"], wrap=True),
    gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved total amount proposals"], wrap=True)
  ],
  examples=fake_PDF_examples,
  allow_flagging='never'
)

# Starts the app as soon as this file is executed (there is no __main__ guard).
remittance_letter_demo_without_verification_from_PDF.launch()