Spaces:
Sleeping
Sleeping
Refactored pipeline, added doc
Browse files
app.py
CHANGED
@@ -36,6 +36,11 @@ dspy.settings.configure(lm=model)
|
|
36 |
|
37 |
# Utils
|
38 |
def parse_CSV_string(csv_string):
    """Parse a CSV string into a list of unique, normalized items.

    Every item is stripped of surrounding whitespace and lower-cased before
    duplicates are dropped. Items are returned in order of first appearance,
    so the result is identical on every run (iterating a ``set`` of strings
    is not, because string hashing is randomized per interpreter process).

    Args:
        csv_string: comma-separated values (string)
    Returns:
        list of unique, lower-cased, stripped items (list of strings)
    """
    normalized = (item.strip().lower() for item in csv_string.split(','))
    # dict.fromkeys removes duplicates while keeping insertion order.
    return list(dict.fromkeys(normalized))
|
41 |
|
@@ -43,9 +48,47 @@ def parse_list_of_CSV_strings(list_of_csv_strings):
|
|
43 |
# Parses a list of CSV strings with invoice numbers into a list of lists
|
44 |
parsed_csv_list = []
|
45 |
for csv_string in list_of_csv_strings:
|
46 |
-
parsed_csv_list.append(
|
47 |
return parsed_csv_list
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
def parse_invoice_number(s):
|
50 |
# Return the invoice number in Siemens' format if found, otherwise just return the string
|
51 |
rp = r'^\s*?([\S\d]+\d{6})'
|
@@ -116,21 +159,22 @@ class FindInvoiceNumberColumns(dspy.Signature):
|
|
116 |
"invoice numbers")
|
117 |
|
118 |
class InvoiceColumnHeaders(dspy.Module):
|
119 |
-
|
|
|
|
|
|
|
|
|
|
|
120 |
super().__init__()
|
121 |
-
|
122 |
-
|
123 |
-
self.potential_invoice_column_headers = dspy.Predict(FindInvoiceNumberColumns) # Ervin suggests Predict
|
124 |
|
125 |
def forward(self, file_content):
|
126 |
prediction = self.potential_invoice_column_headers(content=file_content)
|
127 |
-
#
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
# This creates a new Prediction object adding the File Content
|
132 |
-
# return Prediction(content=file_content, column_header_names=prediction.column_header_names, rationale=prediction.rationale)
|
133 |
-
# Creating a new Prediction object with extra data can be useful if we need more data for the verification
|
134 |
|
135 |
class FindInvoiceList(dspy.Signature):
|
136 |
"""Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
|
@@ -140,30 +184,32 @@ class FindInvoiceList(dspy.Signature):
|
|
140 |
candidate_invoice_numbers = dspy.OutputField(desc="comma-separated list of invoice numbers")
|
141 |
|
142 |
class InvoiceList(dspy.Module):
|
143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
super().__init__()
|
145 |
-
self.
|
|
|
146 |
self.find_invoice_numbers = dspy.Predict(FindInvoiceList)
|
147 |
|
148 |
def forward(self, file_content):
|
149 |
-
# Predict column headers (returns a Prediction with a CSV string in "column_header_names")
|
150 |
predict_column_headers = self.find_invoice_headers(file_content=file_content)
|
151 |
-
|
152 |
-
potential_invoice_column_headers = parse_CSV_string(predict_column_headers.column_header_names)
|
153 |
-
|
154 |
-
potential_invoices = []
|
155 |
|
|
|
156 |
for header in potential_invoice_column_headers:
|
157 |
prediction = self.find_invoice_numbers(content=file_content, invoice_column_header=header)
|
158 |
-
|
159 |
-
|
160 |
# Remove duplicates
|
161 |
-
|
162 |
-
|
163 |
-
# return Prediction(candidate_invoice_numbers=candidates, column_header_names=col_names)
|
164 |
-
# return potential_invoices
|
165 |
-
# We need to return a Prediction for the Evaluate function later on
|
166 |
-
return Prediction(candidate_invoice_numbers=potential_invoices)
|
167 |
|
168 |
class FindTotalAmountColumns(dspy.Signature):
|
169 |
"""Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
|
@@ -203,7 +249,7 @@ class RemittanceLetterTotalAmount(dspy.Module):
|
|
203 |
# Predict column headers (returns a Prediction with a CSV string in "column_header_names")
|
204 |
predict_column_headers = self.find_total_amount_header(file_content=file_content)
|
205 |
# Parse CSV into a list
|
206 |
-
potential_total_amount_column_headers =
|
207 |
|
208 |
potential_total_amounts = []
|
209 |
|
@@ -225,6 +271,8 @@ def poc_production_pipeline_without_verification(file_content):
|
|
225 |
invoice_list_baseline = InvoiceList()
|
226 |
candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
|
227 |
|
|
|
|
|
228 |
# Get total amount candidates
|
229 |
total_amount_baseline = RemittanceLetterTotalAmount()
|
230 |
|
|
|
36 |
|
37 |
# Utils
|
38 |
def parse_CSV_string(csv_string):
    """Split a CSV string on commas and trim whitespace around each item.

    Args:
        csv_string: comma-separated values (string)
    Returns:
        list of stripped items, duplicates and original casing kept
    """
    return [item.strip() for item in csv_string.split(',')]
|
41 |
+
|
42 |
+
|
43 |
+
def parse_CSV_string_to_unique(csv_string):
    """Parse a CSV string into a list of unique, normalized items.

    Every item is stripped of surrounding whitespace and lower-cased before
    duplicates are dropped. Items are returned in order of first appearance,
    so the result is identical on every run (iterating a ``set`` of strings
    is not, because string hashing is randomized per interpreter process).

    Args:
        csv_string: comma-separated values (string)
    Returns:
        list of unique, lower-cased, stripped items (list of strings)
    """
    normalized = (item.strip().lower() for item in csv_string.split(','))
    # dict.fromkeys removes duplicates while keeping insertion order.
    return list(dict.fromkeys(normalized))
|
46 |
|
|
|
48 |
# Parses a list of CSV strings with invoice numbers into a list of lists
|
49 |
parsed_csv_list = []
|
50 |
for csv_string in list_of_csv_strings:
|
51 |
+
parsed_csv_list.append(parse_CSV_string_to_unique(csv_string))
|
52 |
return parsed_csv_list
|
53 |
|
54 |
+
def parse_column_names(s):
    """
    Parse a comma-separated list of column names from a string.
    Removes the 'Column Header Names:' prefix (matched case-insensitively,
    ignoring leading whitespace) before splitting the string.
    Args:
        s: raw response from the model, comma-separated list of column names (string)
    Returns:
        list of column names (list of strings)
    """
    prefix = 'Column Header Names:'
    # Strip before testing AND slicing: the prefix check used to run on the
    # stripped text while the slice ran on the raw text, so any leading
    # whitespace left part of the prefix glued to the first column name.
    s = s.strip()
    if s.lower().startswith(prefix.lower()):
        s = s[len(prefix):]
    return [name.strip() for name in s.split(',')]
|
70 |
+
|
71 |
+
def remove_duplicate_lists(lists):
    """
    Drop lists that repeat the contents of an earlier list.
    Two lists count as duplicates when they hold the same items, regardless
    of item order. The first occurrence is kept, unchanged.
    Args:
        lists:
            a list of lists of strings
    Returns:
        a list of lists of strings, where each list is unique
    """
    first_by_key = {}
    for candidate in lists:
        # The sorted tuple is an order-insensitive, hashable fingerprint;
        # setdefault keeps only the first list seen for each fingerprint.
        first_by_key.setdefault(tuple(sorted(candidate)), candidate)
    return list(first_by_key.values())
|
90 |
+
|
91 |
+
|
92 |
def parse_invoice_number(s):
|
93 |
# Return the invoice number in Siemens' format if found, otherwise just return the string
|
94 |
rp = r'^\s*?([\S\d]+\d{6})'
|
|
|
159 |
"invoice numbers")
|
160 |
|
161 |
class InvoiceColumnHeaders(dspy.Module):
    """
    Predict the column headers containing invoice numbers from the remittance letter.
    Attributes:
        response_parser: a function that takes a string and returns a list of strings.
        potential_invoice_column_headers: dspy.Predict over FindInvoiceNumberColumns.
    """

    def __init__(self, response_parser=parse_CSV_string):
        super().__init__()
        self.response_parser = response_parser
        self.potential_invoice_column_headers = dspy.Predict(FindInvoiceNumberColumns)

    def forward(self, file_content):
        """
        Run the predictor on the letter and return its de-duplicated headers.
        Args:
            file_content: content of the remittance letter, passed to the predictor as `content`
        Returns:
            a Prediction whose `column_header_names` is a list of unique headers,
            in the order the parser produced them
        """
        prediction = self.potential_invoice_column_headers(content=file_content)
        parsed_headers = self.response_parser(prediction.column_header_names)
        # Remove duplicates while keeping first-seen order. A plain set would
        # hand the headers back in hash order, which differs between runs.
        unique_headers = list(dict.fromkeys(parsed_headers))
        # Create a new Prediction object with the unique headers
        return Prediction(column_header_names=unique_headers)
|
|
|
|
|
|
|
178 |
|
179 |
class FindInvoiceList(dspy.Signature):
|
180 |
"""Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
|
|
|
184 |
candidate_invoice_numbers = dspy.OutputField(desc="comma-separated list of invoice numbers")
|
185 |
|
186 |
class InvoiceList(dspy.Module):
    """
    Collect candidate invoice-number lists from a remittance letter.
    One list of invoice numbers is produced per predicted invoice column header.
    Attributes:
        response_parser: A function that takes a string and returns a list of invoice numbers.
    Returns:
        A Prediction object with the following fields:
            candidate_invoice_numbers: A list of lists of invoice numbers.
    """

    def __init__(self, response_parser=parse_CSV_string_to_unique):
        super().__init__()
        self.response_parser = response_parser
        # here we could load a compiled program also
        self.find_invoice_headers = InvoiceColumnHeaders(response_parser=parse_column_names)
        self.find_invoice_numbers = dspy.Predict(FindInvoiceList)

    def forward(self, file_content):
        """
        Predict the invoice column headers, then the invoice numbers under each one.
        Args:
            file_content: content of the remittance letter
        Returns:
            a Prediction whose `candidate_invoice_numbers` holds one parsed list
            per header, with duplicate lists removed
        """
        headers = self.find_invoice_headers(file_content=file_content).column_header_names

        per_header_numbers = [
            self.response_parser(
                self.find_invoice_numbers(
                    content=file_content,
                    invoice_column_header=header,
                ).candidate_invoice_numbers
            )
            for header in headers
        ]
        # Remove duplicates
        return Prediction(candidate_invoice_numbers=remove_duplicate_lists(per_header_numbers))
|
|
|
|
|
|
|
|
|
213 |
|
214 |
class FindTotalAmountColumns(dspy.Signature):
|
215 |
"""Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
|
|
|
249 |
# Predict column headers (returns a Prediction with a CSV string in "column_header_names")
|
250 |
predict_column_headers = self.find_total_amount_header(file_content=file_content)
|
251 |
# Parse CSV into a list
|
252 |
+
potential_total_amount_column_headers = parse_CSV_string_to_unique(predict_column_headers.total_column_header_names)
|
253 |
|
254 |
potential_total_amounts = []
|
255 |
|
|
|
271 |
invoice_list_baseline = InvoiceList()
|
272 |
candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
|
273 |
|
274 |
+
candidate_invoices = [",".join(lst) for lst in candidate_invoices]
|
275 |
+
|
276 |
# Get total amount candidates
|
277 |
total_amount_baseline = RemittanceLetterTotalAmount()
|
278 |
|