# Page header captured together with the file: Spaces status "Sleeping"; file size 11,306 bytes.
# Revisions shown in the blame gutter: 275359b, 7698d94, 9aedcaa, c535f48 (line-number gutter 1-268 omitted).
import os
#DSPY
import dspy
from dspy import Prediction
from dspy.evaluate import Evaluate
from dspy import Prediction
from dspy.teleprompt import BootstrapFewShot
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
# Data handling
# import pandas as pd
# Calculations and formatting
import re
from decimal import Decimal
# UI
import gradio as gr
from gradio_pdf import PDF
# PDF handling
import pdfplumber
# Directory that is listed for example PDFs when the UI is built.
pdf_examples_dir = './pdfexamples/'
# OpenAI-backed language model; the API key is read from the
# OPENAI_PROJECT_KEY environment variable (None if it is not set).
model = dspy.OpenAI(
model='gpt-3.5-turbo-0125',
api_key=os.getenv('OPENAI_PROJECT_KEY'),
max_tokens=2000,
temperature=0.01)
# Install the model in DSPy's global settings so the predictors below use it.
dspy.settings.configure(lm=model)
# Utils
def parse_CSV_string(csv_string):
    """Split a comma-separated string into a list of unique, normalised items.

    Every item is stripped of surrounding whitespace and lower-cased.
    Blank items (from an empty string, a trailing comma or ``",,"``) are
    dropped, and items are returned in order of first appearance so the
    result is the same on every run.

    Args:
        csv_string: Comma-separated text, e.g. ``"Invoice No, Reference"``.

    Returns:
        A list of distinct, non-empty items.
    """
    items = (item.strip().lower() for item in csv_string.split(','))
    # dict.fromkeys de-duplicates while keeping insertion order; a set()
    # would return the items in an order that can change between runs.
    return list(dict.fromkeys(item for item in items if item))
def parse_list_of_CSV_strings(list_of_csv_strings):
    """Parse each CSV string (e.g. of invoice numbers) into its own list.

    Args:
        list_of_csv_strings: Iterable of comma-separated strings.

    Returns:
        A list of lists, one parsed list per input string, in input order.
    """
    return [parse_CSV_string(entry) for entry in list_of_csv_strings]
def parse_invoice_number(s):
    """Return the leading invoice-number token of ``s``, or ``s`` itself.

    The token is the first run of non-whitespace characters (after any
    leading whitespace) that ends in six digits; the original author's note
    describes this as Siemens' invoice-number format.

    Args:
        s: Text that may start with an invoice number.

    Returns:
        The matched token, or ``s`` unchanged when nothing matches.
    """
    # re.match anchors at the start of the string, so no explicit '^' is
    # needed; '\S' already covers digits, so '[\S\d]' was redundant.
    match = re.match(r'\s*(\S+\d{6})', s)
    return match.group(1) if match else s
def standardize_number(s):
    """Rewrite a number string so that its last separator is a decimal point.

    The last ``,`` or ``.`` in ``s`` is taken to be the decimal separator.
    Everything before it is stripped of commas, periods and whitespace;
    everything after it is kept verbatim.

    Note that this is a heuristic: a lone thousands separator is read as a
    decimal separator (``"1,234"`` becomes ``"1.234"``).

    Args:
        s: Text holding a number, e.g. ``"1.234,56"`` or ``"1,234.56"``.

    Returns:
        The rewritten string, e.g. ``"1234.56"``.
    """
    last_separator_index = max(s.rfind(','), s.rfind('.'))
    if last_separator_index == -1:
        # No separator present: only whitespace can be left to remove.
        return re.sub(r'[.,\s]', '', s)

    integer_part = re.sub(r'[.,\s]', '', s[:last_separator_index])
    fraction_part = s[last_separator_index + 1:]
    # Always emit a period, even if the original separator was a comma.
    return integer_part + '.' + fraction_part
def remove_chars_after_last_digit(s):
    """Strip every non-digit character that follows the last digit of ``s``.

    Args:
        s: Arbitrary text.

    Returns:
        ``s`` cut off right after its last digit; ``s`` unchanged when it
        contains no digit at all.
    """
    # The look-behind pins the match to directly after a digit, so strings
    # without digits are left untouched.
    return re.sub(r'(?<=\d)\D*$', '', s)
def clean_text(s):
    """Remove the non-numeric lead-in in front of each digit-bearing word.

    For every match the pattern consumes:
      - a run of characters that are neither digits nor ``-``,
      - at most one whitespace character,
      - a whitespace-free word containing at least one digit (kept).
    Only the word is kept, so currency symbols and labels in front of a
    number disappear while a leading minus sign survives.

    Args:
        s: Text such as ``"eur 1234.56"``.

    Returns:
        The text with those lead-ins removed; text without any digit is
        returned unchanged.
    """
    return re.sub(r'[^\d-]*\s?(\S*\d\S*)', r'\1', s)
def format_text_decimal(text_decimal):
    """Run the full clean-up chain on a textual decimal.

    Steps, in order: trim and lower-case, standardise the separators, cut
    everything after the last digit, then drop the non-numeric lead-in.

    Args:
        text_decimal: Raw text holding an amount.

    Returns:
        The cleaned-up text.
    """
    normalised = text_decimal.strip().lower()
    normalised = standardize_number(normalised)
    normalised = remove_chars_after_last_digit(normalised)
    return clean_text(normalised)
# PDF handling
def extract_text_using_pdfplumber(file_path):
    """Extract the text of every page of a PDF as one string.

    Duplicate characters are removed from each page before its text is
    extracted. Page texts are joined with a newline so the last word of
    one page cannot fuse with the first word of the next.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages, one page after another.
    """
    # TODO: add check for text vs image-only PDFs
    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # Remove duplicate characters from the page.
            deduped_page = page.dedupe_chars(tolerance=1)
            # Guard against a page yielding no text (None or ''), which
            # would otherwise break the string concatenation.
            page_texts.append(deduped_page.extract_text() or '')
    return '\n'.join(page_texts)
def get_PDF_examples(directory):
    """List the PDF files found directly inside ``directory``.

    The extension check is case-insensitive (``.pdf`` and ``.PDF`` both
    count) and the result is sorted, because ``os.listdir`` returns entries
    in arbitrary order.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        Sorted list of paths, each ``directory`` joined with a file name.

    Raises:
        OSError: If ``directory`` cannot be listed (e.g. it does not exist).
    """
    return sorted(
        os.path.join(directory, filename)
        for filename in os.listdir(directory)
        if filename.lower().endswith('.pdf')
    )
# Signatures and Models
class FindInvoiceNumberColumns(dspy.Signature):
    """Given an input remittance letter, return a list of column header names that may contain invoice numbers."""

    # The docstring above is part of the signature itself (DSPy reads it as
    # the task description), so it must not be reworded casually.

    # Identity formatter: per the original note, this keeps the letter's
    # new lines from being skipped.
    content = dspy.InputField(desc="remittance letter", format=lambda s: s)
    column_header_names = dspy.OutputField(
        desc=(
            "comma-separated list of column header names that may contain "
            "invoice numbers"
        )
    )
class InvoiceColumnHeaders(dspy.Module):
    """Ask the LM which column headers of a letter may hold invoice numbers."""

    def __init__(self):
        super().__init__()
        # Plain Predict rather than ChainOfThought (Ervin's suggestion).
        self.potential_invoice_column_headers = dspy.Predict(FindInvoiceNumberColumns)

    def forward(self, file_content):
        """Return the predictor's Prediction for ``file_content``.

        The ``column_header_names`` field of the result is a CSV string.
        """
        # NOTE: for consistency with the other modules this could return a
        # plain (or already parsed) list instead. A new Prediction carrying
        # extra data such as the file content is another option if a later
        # verification step needs it.
        return self.potential_invoice_column_headers(content=file_content)
class FindInvoiceList(dspy.Signature):
    """Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
    """that belong to that column."""

    # The docstring above is part of the signature itself (DSPy reads it as
    # the task description), so it must not be reworded casually.

    # Identity formatter: per the original note, this keeps the letter's
    # new lines from being skipped.
    content = dspy.InputField(desc="remittance letter", format=lambda s: s)
    invoice_column_header = dspy.InputField(desc="invoice column header name")
    candidate_invoice_numbers = dspy.OutputField(desc="comma-separated list of invoice numbers")
class InvoiceList(dspy.Module):
    """Collect candidate invoice-number lists, one per candidate column."""

    def __init__(self):
        super().__init__()
        # A compiled program could be loaded here instead.
        self.find_invoice_headers = InvoiceColumnHeaders()
        self.find_invoice_numbers = dspy.Predict(FindInvoiceList)

    def forward(self, file_content):
        """Return a Prediction whose ``candidate_invoice_numbers`` is a list of lists.

        One LM call finds the candidate column headers; one further call
        per header lists the invoice numbers of that column.
        """
        # The header prediction carries a CSV string in "column_header_names".
        header_prediction = self.find_invoice_headers(file_content=file_content)
        headers = parse_CSV_string(header_prediction.column_header_names)

        raw_candidates = [
            self.find_invoice_numbers(
                content=file_content,
                invoice_column_header=header,
            ).candidate_invoice_numbers
            for header in headers
        ]

        # TODO: remove duplicated lists
        parsed_candidates = parse_list_of_CSV_strings(raw_candidates)

        # A Prediction (not a bare list) is returned so the result can be
        # passed to the Evaluate function later on.
        return Prediction(candidate_invoice_numbers=parsed_candidates)
class FindTotalAmountColumns(dspy.Signature):
    """Given an input remittance letter, return a list of column header names that may contain the total payment amount."""

    # The docstring above is part of the signature itself (DSPy reads it as
    # the task description), so it must not be reworded casually.

    # Identity formatter: per the original note, this keeps the letter's
    # new lines from being skipped.
    content = dspy.InputField(desc="remittance letter", format=lambda s: s)
    total_column_header_names = dspy.OutputField(
        desc=(
            "comma-separated list of column header names that may contain "
            "the remittance letter total payment amount"
        )
    )
class TotalAmountColumnHeaders(dspy.Module):
    """Ask the LM which column headers of a letter may hold the total amount."""

    def __init__(self):
        super().__init__()
        self.potential_total_amount_column_headers = dspy.Predict(FindTotalAmountColumns)

    def forward(self, file_content):
        """Return the predictor's Prediction for ``file_content``.

        The ``total_column_header_names`` field of the result is a CSV string.
        """
        return self.potential_total_amount_column_headers(content=file_content)
class FindTotalAmount(dspy.Signature):
    """Given an input remittance letter and a column header name output the total payment amount """\
    """that belongs to that column."""

    # The docstring above is part of the signature itself (DSPy reads it as
    # the task description), so it must not be reworded casually.

    # Identity formatter: per the original note, this keeps the letter's
    # new lines from being skipped.
    content = dspy.InputField(desc="remittance letter", format=lambda s: s)
    total_amount_column_header = dspy.InputField(desc="total amount header name")
    total_amount = dspy.OutputField(desc="total payment amount")
class RemittanceLetterTotalAmount(dspy.Module):
    """Collect candidate total payment amounts, one per candidate column."""

    def __init__(self):
        super().__init__()
        self.find_total_amount_header = TotalAmountColumnHeaders()
        self.find_total_amount = dspy.Predict(FindTotalAmount)

    def forward(self, file_content):
        """Return a Prediction whose ``candidate_total_amounts`` is a list of strings.

        One LM call finds the candidate column headers; one further call
        per header reads the total amount from that column. Duplicate
        amounts are removed and the remaining ones keep the order in which
        they were first produced.

        The invoice-list prediction is deliberately not run here: both
        modules are called from a pipeline function instead, which avoids
        returning a combined object.
        """
        # The header prediction carries a CSV string in
        # "total_column_header_names".
        header_prediction = self.find_total_amount_header(file_content=file_content)
        headers = parse_CSV_string(header_prediction.total_column_header_names)

        amounts = [
            self.find_total_amount(
                content=file_content,
                total_amount_column_header=header,
            ).total_amount
            for header in headers
        ]

        # De-duplicate with dict.fromkeys rather than set(): a set would
        # return the amounts in an order that can differ between runs.
        unique_amounts = list(dict.fromkeys(amounts))

        # Returning the last raw prediction would also work, but callers
        # read "candidate_total_amounts".
        return Prediction(candidate_total_amounts=unique_amounts)
# Pipeline
def poc_production_pipeline_without_verification(file_content):
    """Produce invoice and total-amount candidates for a remittance letter.

    Args:
        file_content: Text of the remittance letter.

    Returns:
        A pair ``(invoice_rows, total_amount_rows)``. Each is a list of
        one-element tuples, i.e. one single-column row per candidate, for
        display in the UI.
    """
    # TODO: place this in a module - init allows to pass a compiled module
    # and forward handles the data, so the pipeline can be evaluated (check
    # if any tuple matches the verifier).

    # Invoice candidates.
    invoice_candidates = InvoiceList()(file_content=file_content).candidate_invoice_numbers

    # Total amount candidates, each one formatted as a decimal.
    raw_totals = RemittanceLetterTotalAmount()(file_content=file_content).candidate_total_amounts
    total_candidates = [format_text_decimal(amount) for amount in raw_totals]

    # Wrap every candidate in a one-element tuple for the UI tables.
    invoice_rows = [(candidate,) for candidate in invoice_candidates]
    total_amount_rows = [(candidate,) for candidate in total_candidates]
    return invoice_rows, total_amount_rows
def poc_production_pipeline_without_verification_from_PDF(file_path):
    """Run the candidate pipeline on the text extracted from a PDF file.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        Whatever ``poc_production_pipeline_without_verification`` returns
        for the extracted text.
    """
    return poc_production_pipeline_without_verification(
        extract_text_using_pdfplumber(file_path)
    )
# Main app
# Example PDFs handed to the interface as selectable examples.
fake_PDF_examples = get_PDF_examples(pdf_examples_dir)

# One PDF input, two single-column tables as output (invoice proposals and
# total amount proposals); flagging is switched off.
remittance_letter_demo_without_verification_from_PDF = gr.Interface(
    poc_production_pipeline_without_verification_from_PDF,
    [PDF(label="Remittance advice", height=1000)],
    [
        gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved invoice proposals"], wrap=True),
        gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved total amount proposals"], wrap=True)
    ],
    examples=fake_PDF_examples,
    allow_flagging='never'
)

# Start the app (the stray trailing "|" after this call was a syntax error).
remittance_letter_demo_without_verification_from_PDF.launch()