Alejandro-STC committed on
Commit
6f94dc6
·
verified ·
1 Parent(s): 2417224

Update comments

Browse files
Files changed (1) hide show
  1. app.py +5 -15
app.py CHANGED
@@ -70,7 +70,7 @@ def parse_column_names(s):
70
 
71
  def remove_duplicate_lists(lists):
72
  """
73
- Remove duplicate lists from a list of lists.
74
  Args:
75
  lists:
76
  a list of lists of strings
@@ -90,7 +90,7 @@ def remove_duplicate_lists(lists):
90
 
91
 
92
  def parse_invoice_number(s):
93
- # Return the invoice number in Siemens' format if found, otherwise just return the string
94
  rp = r'^\s*?([\S\d]+\d{6})'
95
  m = re.search(rp, s)
96
  return m.group(1) if m else s
@@ -134,11 +134,11 @@ def format_text_decimal(text_decimal):
134
 
135
  # PDF handling
136
  def extract_text_using_pdfplumber(file_path):
137
- # TODO: add check for text vs images padf
138
  with pdfplumber.open(file_path) as pdf:
139
  extracted_text = ''
140
  for i, page in enumerate(pdf.pages):
141
- # Remove duplicate characters from the page.
142
  deduped_page = page.dedupe_chars(tolerance=1)
143
  extracted_text += deduped_page.extract_text()
144
  return extracted_text
@@ -236,16 +236,10 @@ class FindTotalAmount(dspy.Signature):
236
  class RemittanceLetterTotalAmount(dspy.Module):
237
  def __init__(self):
238
  super().__init__()
239
- # self.find_invoice_list = InvoiceList()
240
  self.find_total_amount_header = TotalAmountColumnHeaders()
241
  self.find_total_amount = dspy.Predict(FindTotalAmount)
242
 
243
  def forward(self, file_content):
244
- # Predict invoice list - we could do this here, but let's just call the 2 modules from a function instead
245
- # if we called the invoice list prediction here, we should return an object with both the potential total amounts
246
- # and the potential invoice lists
247
- # predict_invoice_list = self.find_invoice_list(file_content=file_content)
248
-
249
  # Predict column headers (returns a Prediction with a CSV string in "column_header_names")
250
  predict_column_headers = self.find_total_amount_header(file_content=file_content)
251
  # Parse CSV into a list
@@ -259,14 +253,11 @@ class RemittanceLetterTotalAmount(dspy.Module):
259
 
260
  # Remove duplicates
261
  potential_total_amounts = list(set(potential_total_amounts))
262
- return Prediction(candidate_total_amounts=potential_total_amounts) # I could just return "prediction" also (references to candidate_total_amounts should change then)
263
 
264
 
265
  # Pipeline
266
  def poc_production_pipeline_without_verification(file_content):
267
- # TODO: place this in a module - init allows to pass a compiled module and forward handles the data:
268
- # so we can evaluate the pipeline (check if any tuple matches the verifier)
269
-
270
  # Get invoice candidates
271
  invoice_list_baseline = InvoiceList()
272
  candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
@@ -296,7 +287,6 @@ def poc_production_pipeline_without_verification(file_content):
296
 
297
  def poc_production_pipeline_without_verification_from_PDF(file_path):
298
  file_content = extract_text_using_pdfplumber(file_path)
299
- # return str(poc_production_pipeline_without_verification(file_content))
300
  return poc_production_pipeline_without_verification(file_content)
301
 
302
 
 
70
 
71
  def remove_duplicate_lists(lists):
72
  """
73
+ Remove duplicate lists from a list of lists.
74
  Args:
75
  lists:
76
  a list of lists of strings
 
90
 
91
 
92
  def parse_invoice_number(s):
93
+ # Return the invoice number in a specific format if found, otherwise just return the input string
94
  rp = r'^\s*?([\S\d]+\d{6})'
95
  m = re.search(rp, s)
96
  return m.group(1) if m else s
 
134
 
135
  # PDF handling
136
  def extract_text_using_pdfplumber(file_path):
137
+ # TODO: add check for text vs image PDF
138
  with pdfplumber.open(file_path) as pdf:
139
  extracted_text = ''
140
  for i, page in enumerate(pdf.pages):
141
+ # Remove duplicate characters from the page
142
  deduped_page = page.dedupe_chars(tolerance=1)
143
  extracted_text += deduped_page.extract_text()
144
  return extracted_text
 
236
  class RemittanceLetterTotalAmount(dspy.Module):
237
  def __init__(self):
238
  super().__init__()
 
239
  self.find_total_amount_header = TotalAmountColumnHeaders()
240
  self.find_total_amount = dspy.Predict(FindTotalAmount)
241
 
242
  def forward(self, file_content):
 
 
 
 
 
243
  # Predict column headers (returns a Prediction with a CSV string in "column_header_names")
244
  predict_column_headers = self.find_total_amount_header(file_content=file_content)
245
  # Parse CSV into a list
 
253
 
254
  # Remove duplicates
255
  potential_total_amounts = list(set(potential_total_amounts))
256
+ return Prediction(candidate_total_amounts=potential_total_amounts)
257
 
258
 
259
  # Pipeline
260
  def poc_production_pipeline_without_verification(file_content):
 
 
 
261
  # Get invoice candidates
262
  invoice_list_baseline = InvoiceList()
263
  candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
 
287
 
288
  def poc_production_pipeline_without_verification_from_PDF(file_path):
289
  file_content = extract_text_using_pdfplumber(file_path)
 
290
  return poc_production_pipeline_without_verification(file_content)
291
 
292