Spaces:
Sleeping
Sleeping
Update comments
Browse files
app.py
CHANGED
@@ -70,7 +70,7 @@ def parse_column_names(s):
|
|
70 |
|
71 |
def remove_duplicate_lists(lists):
|
72 |
"""
|
73 |
-
|
74 |
Args:
|
75 |
lists:
|
76 |
a list of lists of strings
|
@@ -90,7 +90,7 @@ def remove_duplicate_lists(lists):
|
|
90 |
|
91 |
|
92 |
def parse_invoice_number(s):
|
93 |
-
# Return the invoice number in
|
94 |
rp = r'^\s*?([\S\d]+\d{6})'
|
95 |
m = re.search(rp, s)
|
96 |
return m.group(1) if m else s
|
@@ -134,11 +134,11 @@ def format_text_decimal(text_decimal):
|
|
134 |
|
135 |
# PDF handling
|
136 |
def extract_text_using_pdfplumber(file_path):
|
137 |
-
# TODO: add check for text vs
|
138 |
with pdfplumber.open(file_path) as pdf:
|
139 |
extracted_text = ''
|
140 |
for i, page in enumerate(pdf.pages):
|
141 |
-
# Remove duplicate characters from the page
|
142 |
deduped_page = page.dedupe_chars(tolerance=1)
|
143 |
extracted_text += deduped_page.extract_text()
|
144 |
return extracted_text
|
@@ -236,16 +236,10 @@ class FindTotalAmount(dspy.Signature):
|
|
236 |
class RemittanceLetterTotalAmount(dspy.Module):
|
237 |
def __init__(self):
|
238 |
super().__init__()
|
239 |
-
# self.find_invoice_list = InvoiceList()
|
240 |
self.find_total_amount_header = TotalAmountColumnHeaders()
|
241 |
self.find_total_amount = dspy.Predict(FindTotalAmount)
|
242 |
|
243 |
def forward(self, file_content):
|
244 |
-
# Predict invoice list - we could do this here, but let's just call the 2 modules from a function instead
|
245 |
-
# if we called the invoice list prediction here, we should return an object with both the potential total amounts
|
246 |
-
# and the potential invoice lists
|
247 |
-
# predict_invoice_list = self.find_invoice_list(file_content=file_content)
|
248 |
-
|
249 |
# Predict column headers (returns a Prediction with a CSV string in "column_header_names")
|
250 |
predict_column_headers = self.find_total_amount_header(file_content=file_content)
|
251 |
# Parse CSV into a list
|
@@ -259,14 +253,11 @@ class RemittanceLetterTotalAmount(dspy.Module):
|
|
259 |
|
260 |
# Remove duplicates
|
261 |
potential_total_amounts = list(set(potential_total_amounts))
|
262 |
-
return Prediction(candidate_total_amounts=potential_total_amounts)
|
263 |
|
264 |
|
265 |
# Pipeline
|
266 |
def poc_production_pipeline_without_verification(file_content):
|
267 |
-
# TODO: place this in a module - init allows to pass a compiled module and forward handles the data:
|
268 |
-
# so we can evaluate the pipeline (check if any tuple matches the verifier)
|
269 |
-
|
270 |
# Get invoice candidates
|
271 |
invoice_list_baseline = InvoiceList()
|
272 |
candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
|
@@ -296,7 +287,6 @@ def poc_production_pipeline_without_verification(file_content):
|
|
296 |
|
297 |
def poc_production_pipeline_without_verification_from_PDF(file_path):
|
298 |
file_content = extract_text_using_pdfplumber(file_path)
|
299 |
-
# return str(poc_production_pipeline_without_verification(file_content))
|
300 |
return poc_production_pipeline_without_verification(file_content)
|
301 |
|
302 |
|
|
|
70 |
|
71 |
def remove_duplicate_lists(lists):
|
72 |
"""
|
73 |
+
Remove duplicate lists from a list of lists.
|
74 |
Args:
|
75 |
lists:
|
76 |
a list of lists of strings
|
|
|
90 |
|
91 |
|
92 |
def parse_invoice_number(s):
|
93 |
+
# Return the invoice number in a specific format if found, otherwise just return the input string
|
94 |
rp = r'^\s*?([\S\d]+\d{6})'
|
95 |
m = re.search(rp, s)
|
96 |
return m.group(1) if m else s
|
|
|
134 |
|
135 |
# PDF handling
|
136 |
def extract_text_using_pdfplumber(file_path):
|
137 |
+
# TODO: add check for text vs image PDF
|
138 |
with pdfplumber.open(file_path) as pdf:
|
139 |
extracted_text = ''
|
140 |
for i, page in enumerate(pdf.pages):
|
141 |
+
# Remove duplicate characters from the page
|
142 |
deduped_page = page.dedupe_chars(tolerance=1)
|
143 |
extracted_text += deduped_page.extract_text()
|
144 |
return extracted_text
|
|
|
236 |
class RemittanceLetterTotalAmount(dspy.Module):
|
237 |
def __init__(self):
|
238 |
super().__init__()
|
|
|
239 |
self.find_total_amount_header = TotalAmountColumnHeaders()
|
240 |
self.find_total_amount = dspy.Predict(FindTotalAmount)
|
241 |
|
242 |
def forward(self, file_content):
|
|
|
|
|
|
|
|
|
|
|
243 |
# Predict column headers (returns a Prediction with a CSV string in "column_header_names")
|
244 |
predict_column_headers = self.find_total_amount_header(file_content=file_content)
|
245 |
# Parse CSV into a list
|
|
|
253 |
|
254 |
# Remove duplicates
|
255 |
potential_total_amounts = list(set(potential_total_amounts))
|
256 |
+
return Prediction(candidate_total_amounts=potential_total_amounts)
|
257 |
|
258 |
|
259 |
# Pipeline
|
260 |
def poc_production_pipeline_without_verification(file_content):
|
|
|
|
|
|
|
261 |
# Get invoice candidates
|
262 |
invoice_list_baseline = InvoiceList()
|
263 |
candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
|
|
|
287 |
|
288 |
def poc_production_pipeline_without_verification_from_PDF(file_path):
|
289 |
file_content = extract_text_using_pdfplumber(file_path)
|
|
|
290 |
return poc_production_pipeline_without_verification(file_content)
|
291 |
|
292 |
|