File size: 11,900 Bytes
275359b
 
 
 
 
 
 
 
 
 
 
7698d94
275359b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7698d94
275359b
 
 
 
 
 
 
 
a8e9aaf
 
 
 
 
275359b
 
 
 
 
 
 
a8e9aaf
275359b
 
a8e9aaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f94dc6
a8e9aaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275359b
6f94dc6
275359b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f94dc6
275359b
 
 
6f94dc6
275359b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8e9aaf
 
 
 
 
 
275359b
a8e9aaf
 
275359b
 
 
a8e9aaf
 
 
 
275359b
 
 
 
 
 
 
 
 
a8e9aaf
 
 
 
 
 
 
 
 
275359b
a8e9aaf
 
275359b
 
 
 
a8e9aaf
275359b
a8e9aaf
275359b
 
a8e9aaf
 
275359b
a8e9aaf
 
275359b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8e9aaf
275359b
 
 
 
 
 
 
 
 
6f94dc6
275359b
 
 
 
 
 
 
 
a8e9aaf
 
275359b
 
 
 
 
 
4df9fa6
 
275359b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9aedcaa
275359b
9aedcaa
 
275359b
 
 
 
 
2417224
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
import os

#DSPY
import dspy
from dspy import Prediction
from dspy.evaluate import Evaluate
from dspy import Prediction
from dspy.teleprompt import BootstrapFewShot
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# Data handling
# import pandas as pd

# Calculations and formatting
import re
from decimal import Decimal

# UI
import gradio as gr
from gradio_pdf import PDF

# PDF handling
import pdfplumber


# Directory that is scanned for the example PDFs offered in the UI.
pdf_examples_dir = './pdfexamples/'

# Language-model client. The API key is read from the OPENAI_PROJECT_KEY
# environment variable (os.getenv returns None when the variable is unset).
model = dspy.OpenAI(
    model='gpt-3.5-turbo-0125',
    api_key=os.getenv('OPENAI_PROJECT_KEY'),
    max_tokens=2000,
    temperature=0.01)

# Register the client as the `lm` in DSPy's settings.
dspy.settings.configure(lm=model)


# Utils
def parse_CSV_string(csv_string):
  """
  Split a comma-separated string into a list of whitespace-trimmed items.
  Args:
    csv_string: comma-separated values (string)
  Returns:
    list of items in their original order; empty items are kept (list of strings)
  """
  return [item.strip() for item in csv_string.split(',')]


def parse_CSV_string_to_unique(csv_string):
  """
  Parse a comma-separated string into a list of unique, lowercased items.
  Each item is stripped of surrounding whitespace and lowercased before
  duplicates are removed, so "INV1" and " inv1 " count as the same item.
  Args:
    csv_string: comma-separated values (string)
  Returns:
    list of unique items in order of first appearance (list of strings)
  """
  normalized_items = (item.strip().lower() for item in csv_string.split(','))
  # dict.fromkeys de-duplicates while keeping first-seen order; going through
  # set() would return the items in an order that can differ between runs.
  return list(dict.fromkeys(normalized_items))

def parse_list_of_CSV_strings(list_of_csv_strings):
  """
  Parse every CSV string of a list with parse_CSV_string_to_unique.
  Args:
    list_of_csv_strings: iterable of comma-separated strings (e.g. invoice numbers)
  Returns:
    list with one parsed list per input string, in input order (list of lists)
  """
  return [parse_CSV_string_to_unique(entry) for entry in list_of_csv_strings]

def parse_column_names(s):
  """
  Parse a comma-separated list of column names from a string.
  Removes a leading 'Column Header Names:' prefix (matched case-insensitively,
  ignoring surrounding whitespace) before splitting the string.
  Args:
    s: raw response from the model, comma-separated list of column names (string)
  Returns:
    list of column names (list of strings)
  """
  prefix = 'Column Header Names:'
  # Strip before slicing: the prefix test must look at the same text that is
  # sliced, otherwise leading whitespace shifts the cut and leaves part of the
  # prefix in the first column name.
  s = s.strip()
  if s.lower().startswith(prefix.lower()):
    s = s[len(prefix):]
  return [name.strip() for name in s.split(',')]

def remove_duplicate_lists(lists):
    """
    Drop lists that contain the same items as an earlier list.
    Two lists count as duplicates when their sorted contents are equal, so
    item order inside a list does not matter.
    Args:
      lists:
        a list of lists of strings
    Returns:
        a list with the first occurrence of each distinct list, in input order
    """
    first_by_contents = {}
    for candidate in lists:
        # The sorted tuple is an order-insensitive, hashable key; setdefault
        # keeps the list that was stored first for that key.
        first_by_contents.setdefault(tuple(sorted(candidate)), candidate)
    return list(first_by_contents.values())


def parse_invoice_number(s):
  """
  Extract a leading invoice number from a string.
  After optional leading whitespace, the pattern captures the longest run of
  non-whitespace characters that ends in six digits (at least seven
  characters in total).
  Args:
    s: text expected to start with an invoice number (string)
  Returns:
    the captured invoice number, or the input string unchanged when the
    pattern does not match (string)
  """
  match = re.search(r'^\s*?([\S\d]+\d{6})', s)
  if match is None:
    return s
  return match.group(1)

def standardize_number(s):
    """
    Rewrite a number string so that its last separator becomes a decimal point.
    The last comma or period in the string is treated as the decimal
    separator; every comma, period and whitespace character before it is
    removed. Text after that separator is kept untouched.
    Args:
      s: number as text, e.g. "1.234,56" or "1,234.56" (string)
    Returns:
      the rewritten text, e.g. "1234.56"; without any comma or period only
      whitespace is removed (string)
    """
    separators_and_space = r'[.,\s]'
    split_at = max(s.rfind(','), s.rfind('.'))

    # No comma or period at all: nothing to promote to a decimal point.
    if split_at == -1:
        return re.sub(separators_and_space, '', s)

    integer_part = re.sub(separators_and_space, '', s[:split_at])
    fraction_part = s[split_at + 1:]
    return f'{integer_part}.{fraction_part}'

def remove_chars_after_last_digit(s):
    """
    Cut off everything that follows the last digit of a string.
    Args:
      s: text that may carry a non-digit suffix, e.g. "123.45 eur" (string)
    Returns:
      the text up to and including its last digit; a string without any
      digit is returned unchanged (string)
    """
    # The lookbehind anchors the match right after a digit, and the run of
    # non-digits must reach the end of the string.
    trailing_non_digits = re.compile(r'(?<=\d)[^\d]*$')
    return trailing_non_digits.sub('', s)

def clean_text(s):
    """
    Strip the text that precedes a number-like token.
    Every regex match is made of three parts: a run of characters that are
    neither digits nor '-', one optional whitespace character, and a
    whitespace-free token containing at least one digit. The whole match is
    replaced by that token alone, so a leading '-' sign on the token is kept.
    Args:
      s: text containing a number, e.g. "total: 123.45" (string)
    Returns:
      the text with such leading parts removed, e.g. "123.45"; text without
      any digit is returned unchanged (string)
    """
    prefix_then_token = r'[^\d-]*\s?(\S*\d\S*)'
    return re.sub(prefix_then_token, r'\1', s)

def format_text_decimal(text_decimal):
  """
  Normalise a textual amount by running the formatting helpers in sequence.
  Steps: trim and lowercase the input, standardize_number,
  remove_chars_after_last_digit, then clean_text.
  Args:
    text_decimal: amount as text (string)
  Returns:
    the formatted amount (string)
  """
  normalized = text_decimal.strip().lower()
  with_decimal_point = standardize_number(normalized)
  without_suffix = remove_chars_after_last_digit(with_decimal_point)
  return clean_text(without_suffix)


# PDF handling
def extract_text_using_pdfplumber(file_path):
  """
  Extract the text of every page of a PDF as a single string.
  Duplicated characters are removed from each page before its text is
  extracted. Page texts are joined with a newline so that the last line of
  one page does not run into the first line of the next.
  Args:
    file_path: path of the PDF file, passed to pdfplumber.open
  Returns:
    the concatenated text of all pages; '' for a PDF without pages (string)
  """
  # TODO: add check for text vs image PDF
  page_texts = []
  with pdfplumber.open(file_path) as pdf:
    for page in pdf.pages:
      # Remove duplicate characters from the page
      deduped_page = page.dedupe_chars(tolerance=1)
      # Guard against a missing result: extract_text() may return None for a
      # page without extractable text (seen in older pdfplumber releases),
      # which would otherwise break the string concatenation.
      page_texts.append(deduped_page.extract_text() or '')
  return '\n'.join(page_texts)

def get_PDF_examples(directory):
  """
  List the PDF files found directly inside a directory.
  Args:
    directory: path of the directory to scan (string)
  Returns:
    paths (directory joined with file name) of all entries whose name ends
    with '.pdf', in the order reported by os.listdir (list of strings)
  """
  return [
    os.path.join(directory, entry)
    for entry in os.listdir(directory)
    if entry.endswith('.pdf')
  ]


# Signatures and Models
class FindInvoiceNumberColumns(dspy.Signature):
  """Given an input remittance letter, return a list of column header names that may contain invoice numbers."""
  # NOTE(review): DSPy presumably feeds the docstring above and the `desc`
  # strings below into the prompt sent to the model; treat them as runtime
  # text and confirm before rewording them.
  # Input: the remittance letter text.
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
  # Output: the candidate header names, returned as one comma-separated string.
  column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "\
                                                "invoice numbers")

class InvoiceColumnHeaders(dspy.Module):
  """
  Predict the column headers containing invoice numbers from the remittance letter.
  Attributes:
    response_parser: a function that takes a string and returns a list of strings.
    potential_invoice_column_headers: dspy.Predict built on FindInvoiceNumberColumns.
  """
  def __init__(self, response_parser=parse_CSV_string):
    super().__init__()
    self.response_parser = response_parser
    self.potential_invoice_column_headers = dspy.Predict(FindInvoiceNumberColumns)

  def forward(self, file_content):
    """
    Predict and parse the candidate invoice column headers.
    Args:
      file_content: text of the remittance letter (string)
    Returns:
      A Prediction whose column_header_names field is the list of unique
      parsed headers, in order of first appearance.
    """
    prediction = self.potential_invoice_column_headers(content=file_content)
    parsed_headers = self.response_parser(prediction.column_header_names)
    # dict.fromkeys de-duplicates while keeping first-seen order; set() would
    # hand the headers back in an order that can differ between runs.
    unique_headers = list(dict.fromkeys(parsed_headers))
    # Create a new Prediction object with the unique headers
    return Prediction(column_header_names=unique_headers)

class FindInvoiceList(dspy.Signature):
  """Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
  """that belong to that column."""
  # NOTE(review): DSPy presumably feeds the docstring above and the `desc`
  # strings below into the prompt sent to the model; treat them as runtime
  # text and confirm before rewording them.
  # Input: the remittance letter text.
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
  # Input: the name of the column whose invoice numbers are requested.
  invoice_column_header = dspy.InputField(desc="invoice column header name")
  # Output: the invoice numbers, returned as one comma-separated string.
  candidate_invoice_numbers = dspy.OutputField(desc="comma-separated list of invoice numbers")

class InvoiceList(dspy.Module):
  """
  Collect candidate invoice-number lists from a remittance letter.
  The candidate column headers are predicted first; the invoice numbers are
  then predicted once per header.
  Attributes:
    response_parser: A function that takes a string and returns a list of invoice numbers.
  Returns:
    A Prediction object with the following fields:
      candidate_invoice_numbers: A list of lists of invoice numbers.
  """
  def __init__(self, response_parser=parse_CSV_string_to_unique):
    super().__init__()
    self.response_parser = response_parser
    self.find_invoice_headers = InvoiceColumnHeaders(response_parser=parse_column_names) # here we could load a compiled program also
    self.find_invoice_numbers = dspy.Predict(FindInvoiceList)

  def forward(self, file_content):
    headers = self.find_invoice_headers(file_content=file_content).column_header_names

    def numbers_for(header):
      # One prediction per header, parsed into a list of invoice numbers
      prediction = self.find_invoice_numbers(content=file_content, invoice_column_header=header)
      return self.response_parser(prediction.candidate_invoice_numbers)

    per_header_numbers = [numbers_for(header) for header in headers]
    # Different headers can yield the same invoice numbers; keep each list once
    return Prediction(candidate_invoice_numbers=remove_duplicate_lists(per_header_numbers))

class FindTotalAmountColumns(dspy.Signature):
  """Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
  # NOTE(review): DSPy presumably feeds the docstring above and the `desc`
  # strings below into the prompt sent to the model; treat them as runtime
  # text and confirm before rewording them.
  # Input: the remittance letter text.
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
  # Output: the candidate header names, returned as one comma-separated string.
  total_column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "\
                                                "the remittance letter total payment amount")

class TotalAmountColumnHeaders(dspy.Module):
  """
  Predict the column headers that may hold the total payment amount.
  The raw prediction of the FindTotalAmountColumns predictor is returned
  without any parsing.
  """
  def __init__(self):
    super().__init__()
    self.potential_total_amount_column_headers = dspy.Predict(FindTotalAmountColumns)

  def forward(self, file_content):
    return self.potential_total_amount_column_headers(content=file_content)

class FindTotalAmount(dspy.Signature):
  """Given an input remittance letter and a column header name output the total payment amount """\
  """that belongs to that column."""
  # NOTE(review): DSPy presumably feeds the docstring above and the `desc`
  # strings below into the prompt sent to the model; treat them as runtime
  # text and confirm before rewording them.
  # Input: the remittance letter text.
  content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
  # Input: the name of the column whose total amount is requested.
  total_amount_column_header = dspy.InputField(desc="total amount header name")
  # Output: the total payment amount as text.
  total_amount = dspy.OutputField(desc="total payment amount")

class RemittanceLetterTotalAmount(dspy.Module):
  """
  Collect candidate total payment amounts from a remittance letter.
  The candidate column headers are predicted first; the total amount is then
  predicted once per header.
  Returns:
    A Prediction object with the following fields:
      candidate_total_amounts: A list of unique amounts (raw model output
        strings), in order of first appearance.
  """
  def __init__(self):
    super().__init__()
    self.find_total_amount_header = TotalAmountColumnHeaders()
    self.find_total_amount = dspy.Predict(FindTotalAmount)

  def forward(self, file_content):
    # Predict column headers (returns a Prediction with a CSV string in "total_column_header_names")
    predict_column_headers = self.find_total_amount_header(file_content=file_content)
    # Parse CSV into a list
    potential_total_amount_column_headers = parse_CSV_string_to_unique(predict_column_headers.total_column_header_names)

    potential_total_amounts = [
      self.find_total_amount(content=file_content, total_amount_column_header=header).total_amount
      for header in potential_total_amount_column_headers
    ]

    # Remove duplicates; dict.fromkeys keeps first-seen order, whereas set()
    # would return the amounts in an order that can differ between runs.
    potential_total_amounts = list(dict.fromkeys(potential_total_amounts))
    return Prediction(candidate_total_amounts=potential_total_amounts)


# Pipeline
def poc_production_pipeline_without_verification(file_content):
  """
  Run invoice-number and total-amount extraction on a remittance letter.
  Args:
    file_content: text of the remittance letter (string)
  Returns:
    a pair (invoice_rows, total_amount_rows); each is a list of 1-tuples so
    that every candidate becomes one row of a single-column table. An invoice
    row holds one candidate invoice list joined with ','; a total-amount row
    holds one formatted amount. Amounts are unique and keep first-seen order.
  """
  # Get invoice candidates
  invoice_list_baseline = InvoiceList()
  candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
  candidate_invoices = [",".join(lst) for lst in candidate_invoices]

  # Get total amount candidates
  total_amount_baseline = RemittanceLetterTotalAmount()
  raw_total_amounts = total_amount_baseline(file_content=file_content).candidate_total_amounts

  # Format all decimals and only keep unique amounts. Different raw strings can
  # format to the same amount; dict.fromkeys removes those duplicates while
  # keeping first-seen order (set() order can differ between runs).
  candidate_total_amounts = list(dict.fromkeys(map(format_text_decimal, raw_total_amounts)))

  # For UI visualisation purposes, wrap every candidate in a 1-tuple (one table row each)
  candidate_invoices_for_UI = [(candidate,) for candidate in candidate_invoices]
  candidate_total_amounts_for_UI = [(candidate,) for candidate in candidate_total_amounts]

  return candidate_invoices_for_UI, candidate_total_amounts_for_UI

def poc_production_pipeline_without_verification_from_PDF(file_path):
  """
  Run the extraction pipeline on a PDF file.
  Args:
    file_path: path of the remittance letter PDF
  Returns:
    the result of poc_production_pipeline_without_verification for the text
    extracted from the PDF
  """
  return poc_production_pipeline_without_verification(extract_text_using_pdfplumber(file_path))


# Main app
# Paths of the example PDFs found in pdf_examples_dir.
fake_PDF_examples = get_PDF_examples(pdf_examples_dir)

# Gradio interface around the PDF pipeline: one PDF input and two
# single-column dataframes as outputs (invoice proposals, total amount
# proposals), matching the two lists returned by the pipeline function.
remittance_letter_demo_without_verification_from_PDF = gr.Interface(
  poc_production_pipeline_without_verification_from_PDF,
  [PDF(label="Remittance advice", height=1000)],
  [
    gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved invoice proposals"], wrap=True),
    gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved total amount proposals"], wrap=True)
  ],
  examples=fake_PDF_examples,
  allow_flagging='never'
)

# Launched at module level (there is no __main__ guard), so importing this
# module also starts the app.
remittance_letter_demo_without_verification_from_PDF.launch()