Spaces:

Saltech
/

remittance-processing

Sleeping

App Files Files Community

Alejandro-STC committed on Jul 22, 2024

Commit

6f94dc6

verified ·

1 Parent(s): 2417224

Update comments

Browse files

Files changed (1) hide show

app.py +5 -15

app.py CHANGED Viewed

@@ -70,7 +70,7 @@ def parse_column_names(s):
 def remove_duplicate_lists(lists):
     """
-        Remove duplicate lists from a list of lists.
     Args:
       lists:
         a list of lists of strings
@@ -90,7 +90,7 @@ def remove_duplicate_lists(lists):
 def parse_invoice_number(s):
-  # Return the invoice number in Siemens' format if found, otherwise just return the string
   rp = r'^\s*?([\S\d]+\d{6})'
   m = re.search(rp, s)
   return m.group(1) if m else s
@@ -134,11 +134,11 @@ def format_text_decimal(text_decimal):
 # PDF handling
 def extract_text_using_pdfplumber(file_path):
-  # TODO: add check for text vs images padf
   with pdfplumber.open(file_path) as pdf:
     extracted_text = ''
     for i, page in enumerate(pdf.pages):
-      # Remove duplicate characters from the page.
       deduped_page = page.dedupe_chars(tolerance=1)
       extracted_text += deduped_page.extract_text()
   return extracted_text
@@ -236,16 +236,10 @@ class FindTotalAmount(dspy.Signature):
 class RemittanceLetterTotalAmount(dspy.Module):
   def __init__(self):
     super().__init__()
-    # self.find_invoice_list = InvoiceList()
     self.find_total_amount_header = TotalAmountColumnHeaders()
     self.find_total_amount = dspy.Predict(FindTotalAmount)
   def forward(self, file_content):
-    # Predict invoice list - we could do this here, but let's just call the 2 modules from a function instead
-    # if we called the invoice list prediction here, we should return an object with both the potential total amounts
-    # and the potential invoice lists
-    # predict_invoice_list = self.find_invoice_list(file_content=file_content)
     # Predict column headers (returns a Prediction with a CSV string in "column_header_names")
     predict_column_headers = self.find_total_amount_header(file_content=file_content)
     # Parse CSV into a list
@@ -259,14 +253,11 @@ class RemittanceLetterTotalAmount(dspy.Module):
     # Remove duplicates
     potential_total_amounts = list(set(potential_total_amounts))
-    return Prediction(candidate_total_amounts=potential_total_amounts) # I could just return "prediction" also (references to candidate_total_amounts should change then)
 # Pipeline
 def poc_production_pipeline_without_verification(file_content):
-  # TODO: place this in a module - init allows to pass a compiled module and forward handles the data:
-  # so we can evaluate the pipeline (check if any tuple matches the verifier)
   # Get invoice candidates
   invoice_list_baseline = InvoiceList()
   candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
@@ -296,7 +287,6 @@ def poc_production_pipeline_without_verification(file_content):
 def poc_production_pipeline_without_verification_from_PDF(file_path):
   file_content = extract_text_using_pdfplumber(file_path)
-  # return str(poc_production_pipeline_without_verification(file_content))
   return poc_production_pipeline_without_verification(file_content)

 def remove_duplicate_lists(lists):
     """
+    Remove duplicate lists from a list of lists.
     Args:
       lists:
         a list of lists of strings
 def parse_invoice_number(s):
+  # Return the invoice number in a specific format if found, otherwise just return the input string
   rp = r'^\s*?([\S\d]+\d{6})'
   m = re.search(rp, s)
   return m.group(1) if m else s
 # PDF handling
 def extract_text_using_pdfplumber(file_path):
+  # TODO: add check for text vs image PDF
   with pdfplumber.open(file_path) as pdf:
     extracted_text = ''
     for i, page in enumerate(pdf.pages):
+      # Remove duplicate characters from the page
       deduped_page = page.dedupe_chars(tolerance=1)
       extracted_text += deduped_page.extract_text()
   return extracted_text
 class RemittanceLetterTotalAmount(dspy.Module):
   def __init__(self):
     super().__init__()
     self.find_total_amount_header = TotalAmountColumnHeaders()
     self.find_total_amount = dspy.Predict(FindTotalAmount)
   def forward(self, file_content):
     # Predict column headers (returns a Prediction with a CSV string in "column_header_names")
     predict_column_headers = self.find_total_amount_header(file_content=file_content)
     # Parse CSV into a list
     # Remove duplicates
     potential_total_amounts = list(set(potential_total_amounts))
+    return Prediction(candidate_total_amounts=potential_total_amounts)
 # Pipeline
 def poc_production_pipeline_without_verification(file_content):
   # Get invoice candidates
   invoice_list_baseline = InvoiceList()
   candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
 def poc_production_pipeline_without_verification_from_PDF(file_path):
   file_content = extract_text_using_pdfplumber(file_path)
   return poc_production_pipeline_without_verification(file_content)