Spaces:

Saltech
/

remittance-processing

Running

App Files Files Community

eelang committed on Jul 22, 2024

Commit

a8e9aaf

verified ·

1 Parent(s): 9fb906a

Refactored pipeline, added doc

Browse files

Files changed (1) hide show

app.py +76 -28

app.py CHANGED Viewed

@@ -36,6 +36,11 @@ dspy.settings.configure(lm=model)
 # Utils
 def parse_CSV_string(csv_string):
   # Parses a CSV string into a unique list
   return list(set(map(str.lower, map(str.strip, csv_string.split(',')))))
@@ -43,9 +48,47 @@ def parse_list_of_CSV_strings(list_of_csv_strings):
   # Parses a list of CSV strings with invoice numbers into a list of lists
   parsed_csv_list = []
   for csv_string in list_of_csv_strings:
-    parsed_csv_list.append(parse_CSV_string(csv_string))
   return parsed_csv_list
 def parse_invoice_number(s):
   # Return the invoice number in Siemens' format if found, otherwise just return the string
   rp = r'^\s*?([\S\d]+\d{6})'
@@ -116,21 +159,22 @@ class FindInvoiceNumberColumns(dspy.Signature):
                                                 "invoice numbers")
 class InvoiceColumnHeaders(dspy.Module):
-  def __init__(self):
     super().__init__()
-    # self.potential_invoice_column_headers = dspy.ChainOfThought(FindInvoiceNumberColumns)
-    self.potential_invoice_column_headers = dspy.Predict(FindInvoiceNumberColumns) # Ervin suggests Predict
   def forward(self, file_content):
     prediction = self.potential_invoice_column_headers(content=file_content)
-    # NOTE: Instead of a prediction we could return a simple list (for consistency with my other Modules)
-    # or even a parsed list (not CSV)
-    return prediction
-    # This creates a new Prediction object adding the File Content
-    # return Prediction(content=file_content, column_header_names=prediction.column_header_names, rationale=prediction.rationale)
-    # Creating a new Prediction object with extra data can be useful if we need more data for the verification
 class FindInvoiceList(dspy.Signature):
   """Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
@@ -140,30 +184,32 @@ class FindInvoiceList(dspy.Signature):
   candidate_invoice_numbers = dspy.OutputField(desc="comma-separated list of invoice numbers")
 class InvoiceList(dspy.Module):
-  def __init__(self):
     super().__init__()
-    self.find_invoice_headers = InvoiceColumnHeaders() # here we could load a compiled program also
     self.find_invoice_numbers = dspy.Predict(FindInvoiceList)
   def forward(self, file_content):
-    # Predict column headers (returns a Prediction with a CSV string in "column_header_names")
     predict_column_headers = self.find_invoice_headers(file_content=file_content)
-    # Parse CSV into a list
-    potential_invoice_column_headers = parse_CSV_string(predict_column_headers.column_header_names)
-    potential_invoices = []
     for header in potential_invoice_column_headers:
       prediction = self.find_invoice_numbers(content=file_content, invoice_column_header=header)
-      potential_invoices.append(prediction.candidate_invoice_numbers)
     # Remove duplicates
-    # potential_invoices = list(set(potential_invoices))
-    potential_invoices = parse_list_of_CSV_strings(potential_invoices) # TODO: remove duplicated lists
-    # return Prediction(candidate_invoice_numbers=candidates, column_header_names=col_names)
-    # return potential_invoices
-    # We need to return a Prediction for the Evaluate function later on
-    return Prediction(candidate_invoice_numbers=potential_invoices)
 class FindTotalAmountColumns(dspy.Signature):
   """Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
@@ -203,7 +249,7 @@ class RemittanceLetterTotalAmount(dspy.Module):
     # Predict column headers (returns a Prediction with a CSV string in "column_header_names")
     predict_column_headers = self.find_total_amount_header(file_content=file_content)
     # Parse CSV into a list
-    potential_total_amount_column_headers = parse_CSV_string(predict_column_headers.total_column_header_names)
     potential_total_amounts = []
@@ -225,6 +271,8 @@ def poc_production_pipeline_without_verification(file_content):
   invoice_list_baseline = InvoiceList()
   candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
   # Get total amount candidates
   total_amount_baseline = RemittanceLetterTotalAmount()

 # Utils
 def parse_CSV_string(csv_string):
+  # Parses a CSV string into a list
+  return list(map(str.strip, csv_string.split(',')))
def parse_CSV_string_to_unique(csv_string):
  """
  Parse a comma-separated string into a list of unique, lower-cased values.
  Values are stripped and lower-cased before de-duplication. The first
  occurrence of each value is kept and the original order is preserved, so
  the result is the same on every run. Empty entries are dropped.
  Args:
    csv_string: comma-separated values (string)
  Returns:
    list of unique, non-empty, lower-cased values (list of strings)
  """
  normalized = (item.strip().lower() for item in csv_string.split(','))
  # dict.fromkeys de-duplicates while keeping first-seen order; a set would
  # hand the values back in an arbitrary, run-dependent order.
  return list(dict.fromkeys(item for item in normalized if item))
   # Parses a list of CSV strings with invoice numbers into a list of lists
   parsed_csv_list = []
   for csv_string in list_of_csv_strings:
+    parsed_csv_list.append(parse_CSV_string_to_unique(csv_string))
   return parsed_csv_list
def parse_column_names(s):
  """
  Parse a comma-separated list of column names from a string.
  Removes the 'Column Header Names:' prefix (matched case-insensitively,
  ignoring surrounding whitespace) before splitting the string.
  Empty names are dropped.
  Args:
    s: raw response from the model, comma-separated list of column names (string)
  Returns:
    list of column names (list of strings)
  """
  prefix = 'Column Header Names:'
  prefix_length = len(prefix)
  # Strip first: the prefix must be measured from the real start of the text,
  # otherwise leading whitespace shifts the slice and corrupts the first name.
  s = s.strip()
  if s[:prefix_length].lower() == prefix.lower():
    s = s[prefix_length:]
  return [name for name in map(str.strip, s.split(',')) if name]
def remove_duplicate_lists(lists):
  """
  Drop lists whose contents repeat an earlier list, ignoring element order.
  Args:
    lists: a list of lists of strings
  Returns:
    a list of lists of strings in their original order, keeping only the
    first list seen for each distinct collection of elements
  """
  first_by_key = {}
  for candidate in lists:
    # Sorting makes the key independent of the order of the elements;
    # setdefault keeps the first list registered under each key.
    first_by_key.setdefault(tuple(sorted(candidate)), candidate)
  return list(first_by_key.values())
 def parse_invoice_number(s):
   # Return the invoice number in Siemens' format if found, otherwise just return the string
   rp = r'^\s*?([\S\d]+\d{6})'
                                                 "invoice numbers")
 class InvoiceColumnHeaders(dspy.Module):
+  """
+  Predict the column headers containing invoice numbers from the remittance letter.
+  Attributes:
+    response_parser: a function that takes a string and returns a list of strings.
+  """
+  def __init__(self, response_parser=parse_CSV_string):
     super().__init__()
+    self.response_parser = response_parser
+    self.potential_invoice_column_headers = dspy.Predict(FindInvoiceNumberColumns)
   def forward(self, file_content):
     prediction = self.potential_invoice_column_headers(content=file_content)
+    # Remove duplicates from the prediction
+    unique_headers = list(set(self.response_parser(prediction.column_header_names)))
+    # Create a new Prediction object with the unique headers
+    return Prediction(column_header_names=unique_headers)
 class FindInvoiceList(dspy.Signature):
   """Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
   candidate_invoice_numbers = dspy.OutputField(desc="comma-separated list of invoice numbers")
 class InvoiceList(dspy.Module):
+  """
+  Retrieves a list of list of potential invoice numbers from a remittance letter.
+  Attributes:
+    response_parser: A function that takes a string and returns a list of invoice numbers.
+  Returns:
+    A Prediction object with the following fields:
+      candidate_invoice_numbers: A list of lists of invoice numbers.
+  """
+  def __init__(self, response_parser=parse_CSV_string_to_unique):
     super().__init__()
+    self.response_parser = response_parser
+    self.find_invoice_headers = InvoiceColumnHeaders(response_parser=parse_column_names) # here we could load a compiled program also
     self.find_invoice_numbers = dspy.Predict(FindInvoiceList)
   def forward(self, file_content):
     predict_column_headers = self.find_invoice_headers(file_content=file_content)
+    potential_invoice_column_headers = predict_column_headers.column_header_names
+    candidates = []
     for header in potential_invoice_column_headers:
       prediction = self.find_invoice_numbers(content=file_content, invoice_column_header=header)
+      invoice_number_list = self.response_parser(prediction.candidate_invoice_numbers)
+      candidates.append(invoice_number_list)
     # Remove duplicates
+    candidates = remove_duplicate_lists(candidates)
+    return Prediction(candidate_invoice_numbers=candidates)
 class FindTotalAmountColumns(dspy.Signature):
   """Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
     # Predict column headers (returns a Prediction with a CSV string in "column_header_names")
     predict_column_headers = self.find_total_amount_header(file_content=file_content)
     # Parse CSV into a list
+    potential_total_amount_column_headers = parse_CSV_string_to_unique(predict_column_headers.total_column_header_names)
     potential_total_amounts = []
   invoice_list_baseline = InvoiceList()
   candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
+  candidate_invoices = [",".join(lst) for lst in candidate_invoices]
   # Get total amount candidates
   total_amount_baseline = RemittanceLetterTotalAmount()