eelang committed on
Commit
a8e9aaf
·
verified ·
1 Parent(s): 9fb906a

Refactored pipeline, added doc

Browse files
Files changed (1) hide show
  1. app.py +76 -28
app.py CHANGED
@@ -36,6 +36,11 @@ dspy.settings.configure(lm=model)
36
 
37
  # Utils
38
  def parse_CSV_string(csv_string):
 
 
 
 
 
39
  # Parses a CSV string into a unique list
40
  return list(set(map(str.lower, map(str.strip, csv_string.split(',')))))
41
 
@@ -43,9 +48,47 @@ def parse_list_of_CSV_strings(list_of_csv_strings):
43
  # Parses a list of CSV strings with invoice numbers into a list of lists
44
  parsed_csv_list = []
45
  for csv_string in list_of_csv_strings:
46
- parsed_csv_list.append(parse_CSV_string(csv_string))
47
  return parsed_csv_list
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  def parse_invoice_number(s):
50
  # Return the invoice number in Siemens' format if found, otherwise just return the string
51
  rp = r'^\s*?([\S\d]+\d{6})'
@@ -116,21 +159,22 @@ class FindInvoiceNumberColumns(dspy.Signature):
116
  "invoice numbers")
117
 
118
  class InvoiceColumnHeaders(dspy.Module):
119
- def __init__(self):
 
 
 
 
 
120
  super().__init__()
121
-
122
- # self.potential_invoice_column_headers = dspy.ChainOfThought(FindInvoiceNumberColumns)
123
- self.potential_invoice_column_headers = dspy.Predict(FindInvoiceNumberColumns) # Ervin suggests Predict
124
 
125
  def forward(self, file_content):
126
  prediction = self.potential_invoice_column_headers(content=file_content)
127
- # NOTE: Instead of a prediction we could return a simple list (for consistency with my other Modules)
128
- # or even a parsed list (not CSV)
129
- return prediction
130
-
131
- # This creates a new Prediction object adding the File Content
132
- # return Prediction(content=file_content, column_header_names=prediction.column_header_names, rationale=prediction.rationale)
133
- # Creating a new Prediction object with extra data can be useful if we need more data for the verification
134
 
135
  class FindInvoiceList(dspy.Signature):
136
  """Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
@@ -140,30 +184,32 @@ class FindInvoiceList(dspy.Signature):
140
  candidate_invoice_numbers = dspy.OutputField(desc="comma-separated list of invoice numbers")
141
 
142
  class InvoiceList(dspy.Module):
143
- def __init__(self):
 
 
 
 
 
 
 
 
144
  super().__init__()
145
- self.find_invoice_headers = InvoiceColumnHeaders() # here we could load a compiled program also
 
146
  self.find_invoice_numbers = dspy.Predict(FindInvoiceList)
147
 
148
  def forward(self, file_content):
149
- # Predict column headers (returns a Prediction with a CSV string in "column_header_names")
150
  predict_column_headers = self.find_invoice_headers(file_content=file_content)
151
- # Parse CSV into a list
152
- potential_invoice_column_headers = parse_CSV_string(predict_column_headers.column_header_names)
153
-
154
- potential_invoices = []
155
 
 
156
  for header in potential_invoice_column_headers:
157
  prediction = self.find_invoice_numbers(content=file_content, invoice_column_header=header)
158
- potential_invoices.append(prediction.candidate_invoice_numbers)
159
-
160
  # Remove duplicates
161
- # potential_invoices = list(set(potential_invoices))
162
- potential_invoices = parse_list_of_CSV_strings(potential_invoices) # TODO: remove duplicated lists
163
- # return Prediction(candidate_invoice_numbers=candidates, column_header_names=col_names)
164
- # return potential_invoices
165
- # We need to return a Prediction for the Evaluate function later on
166
- return Prediction(candidate_invoice_numbers=potential_invoices)
167
 
168
  class FindTotalAmountColumns(dspy.Signature):
169
  """Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
@@ -203,7 +249,7 @@ class RemittanceLetterTotalAmount(dspy.Module):
203
  # Predict column headers (returns a Prediction with a CSV string in "column_header_names")
204
  predict_column_headers = self.find_total_amount_header(file_content=file_content)
205
  # Parse CSV into a list
206
- potential_total_amount_column_headers = parse_CSV_string(predict_column_headers.total_column_header_names)
207
 
208
  potential_total_amounts = []
209
 
@@ -225,6 +271,8 @@ def poc_production_pipeline_without_verification(file_content):
225
  invoice_list_baseline = InvoiceList()
226
  candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
227
 
 
 
228
  # Get total amount candidates
229
  total_amount_baseline = RemittanceLetterTotalAmount()
230
 
 
36
 
37
  # Utils
38
  def parse_CSV_string(csv_string):
39
+ # Parses a CSV string into a list
40
+ return list(map(str.strip, csv_string.split(',')))
41
+
42
+
43
def parse_CSV_string_to_unique(csv_string):
    """
    Parse a CSV string into a list of unique, lower-cased items.

    Items are stripped of surrounding whitespace and lower-cased before
    de-duplication. The first occurrence of each item determines its
    position, so the result is the same on every run (a plain ``set``
    would give an order that depends on string hash randomization).

    Args:
        csv_string: comma-separated values (string)
    Returns:
        list of unique lower-cased strings in first-seen order
    """
    normalized = (item.strip().lower() for item in csv_string.split(','))
    # dict keys are unique and keep insertion order
    return list(dict.fromkeys(normalized))
46
 
 
48
  # Parses a list of CSV strings with invoice numbers into a list of lists
49
  parsed_csv_list = []
50
  for csv_string in list_of_csv_strings:
51
+ parsed_csv_list.append(parse_CSV_string_to_unique(csv_string))
52
  return parsed_csv_list
53
 
54
def parse_column_names(s):
    """
    Parse a comma-separated list of column names from a string.

    A leading 'Column Header Names:' prefix (matched case-insensitively,
    ignoring surrounding whitespace) is removed before splitting.

    Args:
        s: raw response from the model, comma-separated list of column names (string)
    Returns:
        list of column names (list of strings), each stripped of whitespace
    """
    prefix = 'Column Header Names:'
    # Strip first so the slice below removes exactly the prefix even when the
    # response starts with spaces or a newline.
    s = s.strip()
    if s[:len(prefix)].lower() == prefix.lower():
        s = s[len(prefix):]
    return [name.strip() for name in s.split(',')]
70
+
71
def remove_duplicate_lists(lists):
    """
    Drop lists that repeat the contents of an earlier list.

    Two lists count as duplicates when they hold the same items regardless
    of order. The first occurrence is kept, in its original position.

    Args:
        lists: a list of lists of strings
    Returns:
        a list of lists of strings, where each list is unique
    """
    first_by_signature = {}
    for candidate in lists:
        # Order-insensitive signature; setdefault keeps only the first list seen for it.
        first_by_signature.setdefault(tuple(sorted(candidate)), candidate)
    return list(first_by_signature.values())
90
+
91
+
92
  def parse_invoice_number(s):
93
  # Return the invoice number in Siemens' format if found, otherwise just return the string
94
  rp = r'^\s*?([\S\d]+\d{6})'
 
159
  "invoice numbers")
160
 
161
  class InvoiceColumnHeaders(dspy.Module):
162
+ """
163
+ Predict the column headers containing invoice numbers from the remittance letter.
164
+ Attributes:
165
+ response_parser: a function that takes a string and returns a list of strings.
166
+ """
167
+ def __init__(self, response_parser=parse_CSV_string):
168
  super().__init__()
169
+ self.response_parser = response_parser
170
+ self.potential_invoice_column_headers = dspy.Predict(FindInvoiceNumberColumns)
 
171
 
172
  def forward(self, file_content):
173
  prediction = self.potential_invoice_column_headers(content=file_content)
174
+ # Remove duplicates from the prediction
175
+ unique_headers = list(set(self.response_parser(prediction.column_header_names)))
176
+ # Create a new Prediction object with the unique headers
177
+ return Prediction(column_header_names=unique_headers)
 
 
 
178
 
179
  class FindInvoiceList(dspy.Signature):
180
  """Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
 
184
  candidate_invoice_numbers = dspy.OutputField(desc="comma-separated list of invoice numbers")
185
 
186
  class InvoiceList(dspy.Module):
187
+ """
188
+ Retrieves a list of list of potential invoice numbers from a remittance letter.
189
+ Attributes:
190
+ response_parser: A function that takes a string and returns a list of invoice numbers.
191
+ Returns:
192
+ A Prediction object with the following fields:
193
+ candidate_invoice_numbers: A list of lists of invoice numbers.
194
+ """
195
+ def __init__(self, response_parser=parse_CSV_string_to_unique):
196
  super().__init__()
197
+ self.response_parser = response_parser
198
+ self.find_invoice_headers = InvoiceColumnHeaders(response_parser=parse_column_names) # here we could load a compiled program also
199
  self.find_invoice_numbers = dspy.Predict(FindInvoiceList)
200
 
201
  def forward(self, file_content):
 
202
  predict_column_headers = self.find_invoice_headers(file_content=file_content)
203
+ potential_invoice_column_headers = predict_column_headers.column_header_names
 
 
 
204
 
205
+ candidates = []
206
  for header in potential_invoice_column_headers:
207
  prediction = self.find_invoice_numbers(content=file_content, invoice_column_header=header)
208
+ invoice_number_list = self.response_parser(prediction.candidate_invoice_numbers)
209
+ candidates.append(invoice_number_list)
210
  # Remove duplicates
211
+ candidates = remove_duplicate_lists(candidates)
212
+ return Prediction(candidate_invoice_numbers=candidates)
 
 
 
 
213
 
214
  class FindTotalAmountColumns(dspy.Signature):
215
  """Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
 
249
  # Predict column headers (returns a Prediction with a CSV string in "column_header_names")
250
  predict_column_headers = self.find_total_amount_header(file_content=file_content)
251
  # Parse CSV into a list
252
+ potential_total_amount_column_headers = parse_CSV_string_to_unique(predict_column_headers.total_column_header_names)
253
 
254
  potential_total_amounts = []
255
 
 
271
  invoice_list_baseline = InvoiceList()
272
  candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
273
 
274
+ candidate_invoices = [",".join(lst) for lst in candidate_invoices]
275
+
276
  # Get total amount candidates
277
  total_amount_baseline = RemittanceLetterTotalAmount()
278