"""Gradio demo that proposes invoice numbers and total amounts for a remittance letter PDF.

The text of the uploaded PDF is extracted with pdfplumber and passed to a set of
DSPy modules that first guess which column headers are relevant and then extract
the values listed under each of those headers.
"""

import os

# DSPY
import dspy
from dspy import Prediction
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShot
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# Data handling
# import pandas as pd

# Calculations and formatting
import re
from decimal import Decimal

# UI
import gradio as gr
from gradio_pdf import PDF

# PDF handling
import pdfplumber

pdf_examples_dir = './pdfexamples/'

model = dspy.OpenAI(
    model='gpt-3.5-turbo-0125',
    api_key=os.getenv('OPENAI_PROJECT_KEY'),
    max_tokens=2000,
    temperature=0.01)

dspy.settings.configure(lm=model)


# Utils
def parse_CSV_string(csv_string):
    """Parse a comma-separated string into a list of unique, lower-cased, stripped items.

    Empty items (from "a,,b", a trailing comma or an empty string) are dropped, and
    duplicates are removed while keeping the order of first appearance so the result
    is deterministic.
    """
    cleaned_items = (item.strip().lower() for item in csv_string.split(','))
    # dict.fromkeys de-duplicates while preserving insertion order
    return list(dict.fromkeys(item for item in cleaned_items if item))


def parse_list_of_CSV_strings(list_of_csv_strings):
    """Parse a list of CSV strings with invoice numbers into a list of lists."""
    return [parse_CSV_string(csv_string) for csv_string in list_of_csv_strings]


def parse_invoice_number(s):
    """Return the invoice number in Siemens' format if found, otherwise return the string unchanged."""
    rp = r'^\s*?([\S\d]+\d{6})'
    m = re.search(rp, s)
    return m.group(1) if m else s


def standardize_number(s):
    """Normalise the separators of a textual number.

    The last comma or period in the string is treated as the decimal separator and
    replaced with a period; every comma, period and whitespace character before it
    is removed.
    """
    # Find the last occurrence of a comma or period
    last_separator_index = max(s.rfind(','), s.rfind('.'))
    if last_separator_index != -1:
        # Split the string into two parts
        before_separator = s[:last_separator_index]
        after_separator = s[last_separator_index + 1:]
        # Clean the first part of any commas, periods, or whitespace
        before_separator_cleaned = re.sub(r'[.,\s]', '', before_separator)
        # Ensure the decimal part starts with a period, even if it was a comma
        standardized_s = before_separator_cleaned + '.' + after_separator
    else:
        # If there's no separator, just remove commas, periods, or whitespace
        standardized_s = re.sub(r'[.,\s]', '', s)
    return standardized_s


def remove_chars_after_last_digit(s):
    """Remove any non-digit characters following the last digit in the string."""
    return re.sub(r'(?<=\d)[^\d]*$', '', s)


def clean_text(s):
    """Strip the text that precedes the first word containing a digit.

    The pattern looks for:
    - a run of characters that are neither digits nor the minus sign, optionally
      followed by one whitespace character
    - followed by a word that contains at least one digit
    and replaces the whole match with just that word.
    """
    # cleaned_s = re.sub(r'\S*?\s*?(\S*\d\S*)', r'\1', s)
    cleaned_s = re.sub(r'[^\d-]*\s?(\S*\d\S*)', r'\1', s)
    return cleaned_s


def format_text_decimal(text_decimal):
    """Run the formatting helpers, in order, on a textual decimal."""
    standardized = standardize_number(text_decimal.strip().lower())
    return clean_text(remove_chars_after_last_digit(standardized))


# PDF handling
def extract_text_using_pdfplumber(file_path):
    """Return the text of every page of the PDF at file_path, one page after another.

    Pages are joined with a newline so the last line of a page is not glued to the
    first line of the next one.
    """
    # TODO: add check for text vs image PDFs
    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # Remove duplicate characters from the page.
            deduped_page = page.dedupe_chars(tolerance=1)
            # Guard against a page without a text layer yielding None
            page_texts.append(deduped_page.extract_text() or '')
    return '\n'.join(page_texts)


def get_PDF_examples(directory):
    """Return the sorted paths of the PDF files found directly inside directory.

    The example files are optional, so a missing directory yields an empty list
    instead of raising while the module is being loaded.
    """
    if not os.path.isdir(directory):
        return []
    return [
        os.path.join(directory, filename)
        for filename in sorted(os.listdir(directory))
        if filename.lower().endswith('.pdf')
    ]


# Signatures and Models
class FindInvoiceNumberColumns(dspy.Signature):
    """Given an input remittance letter, return a list of column header names that may contain invoice numbers."""

    content = dspy.InputField(desc="remittance letter", format=lambda s: s)  # s:s so it doesn't skip the new lines
    column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "
                                                "invoice numbers")


class InvoiceColumnHeaders(dspy.Module):
    # Predicts which column headers of a remittance letter may hold invoice numbers.

    def __init__(self):
        super().__init__()
        # self.potential_invoice_column_headers = dspy.ChainOfThought(FindInvoiceNumberColumns)
        self.potential_invoice_column_headers = dspy.Predict(FindInvoiceNumberColumns)  # Ervin suggests Predict

    def forward(self, file_content):
        # Returns the raw Prediction; the headers are a CSV string in "column_header_names"
        prediction = self.potential_invoice_column_headers(content=file_content)
        # NOTE: Instead of a prediction we could return a simple list (for consistency with my other Modules)
        # or even a parsed list (not CSV)
        return prediction
        # This creates a new Prediction object adding the File Content
        # return Prediction(content=file_content, column_header_names=prediction.column_header_names, rationale=prediction.rationale)
        # Creating a new Prediction object with extra data can be useful if we need more data for the verification


class FindInvoiceList(dspy.Signature):
    """Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
    """that belong to that column."""

    content = dspy.InputField(desc="remittance letter", format=lambda s: s)  # s:s so it doesn't skip the new lines
    invoice_column_header = dspy.InputField(desc="invoice column header name")
    candidate_invoice_numbers = dspy.OutputField(desc="comma-separated list of invoice numbers")


class InvoiceList(dspy.Module):
    # Proposes one list of invoice numbers per candidate invoice column header.

    def __init__(self):
        super().__init__()
        self.find_invoice_headers = InvoiceColumnHeaders()  # here we could load a compiled program also
        self.find_invoice_numbers = dspy.Predict(FindInvoiceList)

    def forward(self, file_content):
        # Predict column headers (returns a Prediction with a CSV string in "column_header_names")
        predict_column_headers = self.find_invoice_headers(file_content=file_content)
        # Parse CSV into a list
        potential_invoice_column_headers = parse_CSV_string(predict_column_headers.column_header_names)

        raw_invoice_lists = []
        for header in potential_invoice_column_headers:
            prediction = self.find_invoice_numbers(content=file_content, invoice_column_header=header)
            raw_invoice_lists.append(prediction.candidate_invoice_numbers)

        # Remove duplicated lists: two headers can yield the same invoice numbers, and
        # lists holding the same set of numbers are the same proposal
        potential_invoices = []
        seen_invoice_sets = set()
        for invoice_list in parse_list_of_CSV_strings(raw_invoice_lists):
            invoice_set = frozenset(invoice_list)
            if invoice_set not in seen_invoice_sets:
                seen_invoice_sets.add(invoice_set)
                potential_invoices.append(invoice_list)

        # return Prediction(candidate_invoice_numbers=candidates, column_header_names=col_names)
        # return potential_invoices
        # We need to return a Prediction for the Evaluate function later on
        return Prediction(candidate_invoice_numbers=potential_invoices)


class FindTotalAmountColumns(dspy.Signature):
    """Given an input remittance letter, return a list of column header names that may contain the total payment amount."""

    content = dspy.InputField(desc="remittance letter", format=lambda s: s)  # s:s so it doesn't skip the new lines
    total_column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "
                                                      "the remittance letter total payment amount")


class TotalAmountColumnHeaders(dspy.Module):
    # Predicts which column headers of a remittance letter may hold the total payment amount.

    def __init__(self):
        super().__init__()
        self.potential_total_amount_column_headers = dspy.Predict(FindTotalAmountColumns)

    def forward(self, file_content):
        # Returns the raw Prediction; the headers are a CSV string in "total_column_header_names"
        prediction = self.potential_total_amount_column_headers(content=file_content)
        return prediction


class FindTotalAmount(dspy.Signature):
    """Given an input remittance letter and a column header name output the total payment amount """\
    """that belongs to that column."""

    content = dspy.InputField(desc="remittance letter", format=lambda s: s)  # s:s so it doesn't skip the new lines
    total_amount_column_header = dspy.InputField(desc="total amount header name")
    total_amount = dspy.OutputField(desc="total payment amount")


class RemittanceLetterTotalAmount(dspy.Module):
    # Proposes one total payment amount per candidate total-amount column header.

    def __init__(self):
        super().__init__()
        # self.find_invoice_list = InvoiceList()
        self.find_total_amount_header = TotalAmountColumnHeaders()
        self.find_total_amount = dspy.Predict(FindTotalAmount)

    def forward(self, file_content):
        # Predict invoice list - we could do this here, but let's just call the 2 modules from a function instead
        # if we called the invoice list prediction here, we should return an object with both the potential total amounts
        # and the potential invoice lists
        # predict_invoice_list = self.find_invoice_list(file_content=file_content)

        # Predict column headers (returns a Prediction with a CSV string in "total_column_header_names")
        predict_column_headers = self.find_total_amount_header(file_content=file_content)
        # Parse CSV into a list
        potential_total_amount_column_headers = parse_CSV_string(predict_column_headers.total_column_header_names)

        potential_total_amounts = []
        for header in potential_total_amount_column_headers:
            prediction = self.find_total_amount(content=file_content, total_amount_column_header=header)
            potential_total_amounts.append(prediction.total_amount)

        # Remove duplicates, keeping the order in which the amounts were proposed
        potential_total_amounts = list(dict.fromkeys(potential_total_amounts))
        return Prediction(candidate_total_amounts=potential_total_amounts)
        # I could just return "prediction" also (references to candidate_total_amounts should change then)


# Pipeline
def poc_production_pipeline_without_verification(file_content):
    """Return the invoice-list and total-amount proposals for a remittance letter text.

    Returns a pair (invoice_rows, total_amount_rows). Each element is a list of
    1-tuples, i.e. one single-cell row per proposal, which is the shape handed to
    the two single-column Dataframe outputs of the UI.
    """
    # TODO: place this in a module - init allows to pass a compiled module and forward handles the data:
    # so we can evaluate the pipeline (check if any tuple matches the verifier)

    # Get invoice candidates
    invoice_list_baseline = InvoiceList()
    candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers

    # Get total amount candidates
    total_amount_baseline = RemittanceLetterTotalAmount()
    raw_total_amounts = total_amount_baseline(file_content=file_content).candidate_total_amounts
    # Format all decimals, then de-duplicate: differently written amounts
    # (e.g. "1,000.00" and "1000.00") only become equal once formatted
    candidate_total_amounts = list(dict.fromkeys(map(format_text_decimal, raw_total_amounts)))

    # For UI visualisation purposes, wrap every candidate in a 1-tuple (a single-cell row)
    candidate_invoices_for_UI = [(candidate,) for candidate in candidate_invoices]
    candidate_total_amounts_for_UI = [(candidate,) for candidate in candidate_total_amounts]
    return candidate_invoices_for_UI, candidate_total_amounts_for_UI


def poc_production_pipeline_without_verification_from_PDF(file_path):
    """Extract the text of the PDF at file_path and run the proposal pipeline on it."""
    file_content = extract_text_using_pdfplumber(file_path)
    # return str(poc_production_pipeline_without_verification(file_content))
    return poc_production_pipeline_without_verification(file_content)


# Main app
fake_PDF_examples = get_PDF_examples(pdf_examples_dir)

remittance_letter_demo_without_verification_from_PDF = gr.Interface(
    poc_production_pipeline_without_verification_from_PDF,
    [PDF(label="Remittance advice", height=1000)],
    [
        gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved invoice proposals"], wrap=True),
        gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved total amount proposals"], wrap=True)
    ],
    examples=fake_PDF_examples,
    allow_flagging='never'
)

remittance_letter_demo_without_verification_from_PDF.launch()