Spaces:
Sleeping
Sleeping
File size: 11,900 Bytes
275359b 7698d94 275359b 7698d94 275359b a8e9aaf 275359b a8e9aaf 275359b a8e9aaf 6f94dc6 a8e9aaf 275359b 6f94dc6 275359b 6f94dc6 275359b 6f94dc6 275359b a8e9aaf 275359b a8e9aaf 275359b a8e9aaf 275359b a8e9aaf 275359b a8e9aaf 275359b a8e9aaf 275359b a8e9aaf 275359b a8e9aaf 275359b a8e9aaf 275359b a8e9aaf 275359b 6f94dc6 275359b a8e9aaf 275359b 4df9fa6 275359b 9aedcaa 275359b 9aedcaa 275359b 2417224 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 |
import os
#DSPY
import dspy
from dspy import Prediction
from dspy.evaluate import Evaluate
from dspy import Prediction
from dspy.teleprompt import BootstrapFewShot
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
# Data handling
# import pandas as pd
# Calculations and formatting
import re
from decimal import Decimal
# UI
import gradio as gr
from gradio_pdf import PDF
# PDF handling
import pdfplumber
# Directory that is passed to get_PDF_examples() to collect the example PDFs
# handed to the Gradio interface.
pdf_examples_dir = './pdfexamples/'
# OpenAI-backed language model client. The API key is read from the
# OPENAI_PROJECT_KEY environment variable (os.getenv yields None when unset).
model = dspy.OpenAI(
    model='gpt-3.5-turbo-0125',
    api_key=os.getenv('OPENAI_PROJECT_KEY'),
    max_tokens=2000,
    temperature=0.01)
# Register the client as the `lm` in dspy's global settings.
dspy.settings.configure(lm=model)
# Utils
def parse_CSV_string(csv_string):
    """Split a comma-separated string into a list of whitespace-trimmed fields.

    Args:
        csv_string: text whose fields are separated by commas.

    Returns:
        One entry per field, in input order; blank fields are kept as ''.
    """
    return [field.strip() for field in csv_string.split(',')]
def parse_CSV_string_to_unique(csv_string):
    """Parse a CSV string into a list of unique, normalized values.

    Every field is stripped of surrounding whitespace and lower-cased.
    Duplicates are dropped while the first-seen order is kept, so the result
    is the same on every run (a plain ``set`` gives an arbitrary order).
    Blank fields, e.g. from an empty string or a trailing comma, are skipped.

    Args:
        csv_string: text whose fields are separated by commas.

    Returns:
        List of unique lower-cased fields in order of first appearance.
    """
    normalized = (field.strip().lower() for field in csv_string.split(','))
    # dict keys are unique and remember insertion order.
    return list(dict.fromkeys(value for value in normalized if value))
def parse_list_of_CSV_strings(list_of_csv_strings):
    """Parse each CSV string (e.g. of invoice numbers) into its own list.

    Args:
        list_of_csv_strings: iterable of comma-separated strings.

    Returns:
        A list with one parsed list per input string, in input order; each
        string is parsed by parse_CSV_string_to_unique.
    """
    return [parse_CSV_string_to_unique(entry) for entry in list_of_csv_strings]
def parse_column_names(s):
    """
    Parse a comma-separated list of column names from a string.

    A leading 'Column Header Names:' prefix (matched case-insensitively) is
    removed before the string is split.

    Args:
        s: raw response from the model, comma-separated list of column names (string)

    Returns:
        list of column names (list of strings), each stripped of surrounding
        whitespace
    """
    prefix = 'Column Header Names:'
    # Strip first so the prefix check and the slice below look at the same
    # text; slicing the unstripped string would leave part of the prefix in
    # the first column name whenever the response starts with whitespace.
    s = s.strip()
    if s[:len(prefix)].lower() == prefix.lower():
        s = s[len(prefix):]
    return [name.strip() for name in s.split(',')]
def remove_duplicate_lists(lists):
    """
    Drop lists that hold the same elements as an earlier list.

    Two lists count as duplicates when their sorted contents are equal, so
    element order inside a list does not matter.

    Args:
        lists: a list of lists of strings

    Returns:
        the first occurrence of every distinct list, in the original order
    """
    kept = []
    fingerprints = set()
    for candidate in lists:
        # A sorted tuple is hashable and independent of element order.
        fingerprint = tuple(sorted(candidate))
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        kept.append(candidate)
    return kept
def parse_invoice_number(s):
    """Extract a leading invoice number from ``s``, or return ``s`` unchanged.

    The invoice number must sit at the start of the string (leading whitespace
    is allowed) and consist of at least one non-whitespace character followed
    by six digits.

    Args:
        s: text that may start with an invoice number.

    Returns:
        The matched invoice number, or the input string when nothing matches.
    """
    found = re.match(r'^\s*?([\S\d]+\d{6})', s)
    if found is None:
        return s
    return found.group(1)
def standardize_number(s):
    """Rewrite a number string so that its last ',' or '.' becomes the only '.'.

    The last comma or period is treated as the decimal separator. Commas,
    periods and whitespace before it are removed; the text after it is kept
    unchanged.

    Args:
        s: number as text, e.g. '1.234,56' or '1,234.56'.

    Returns:
        The rewritten string, e.g. '1234.56'.
    """
    split_at = max(s.rfind(','), s.rfind('.'))
    if split_at == -1:
        # No separator at all: only strip the same set of characters.
        return re.sub(r'[.,\s]', '', s)
    integer_part = re.sub(r'[.,\s]', '', s[:split_at])
    fraction_part = s[split_at + 1:]
    return integer_part + '.' + fraction_part
def remove_chars_after_last_digit(s):
    """Cut off every non-digit character that trails the last digit of ``s``.

    Strings without any digit are returned unchanged.
    """
    trailing_non_digits = re.compile(r'(?<=\d)[^\d]*$')
    return trailing_non_digits.sub('', s)
def clean_text(s):
    """Strip the non-numeric lead-in in front of a digit-bearing token.

    Each match of the pattern consists of:
      - a run of characters that are neither digits nor '-',
      - an optional single whitespace character,
      - a whitespace-free token containing at least one digit (captured).
    The whole match is replaced by the captured token alone. Text without any
    digit is returned unchanged.
    """
    lead_in_and_token = re.compile(r'[^\d-]*\s?(\S*\d\S*)')
    return lead_in_and_token.sub(r'\1', s)
def format_text_decimal(text_decimal):
    """Run the formatting helpers, in order, over a decimal given as text.

    Steps: trim and lower-case the input, then apply standardize_number,
    remove_chars_after_last_digit and finally clean_text.
    """
    formatted = text_decimal.strip().lower()
    formatted = standardize_number(formatted)
    formatted = remove_chars_after_last_digit(formatted)
    return clean_text(formatted)
# PDF handling
def extract_text_using_pdfplumber(file_path):
    """Extract the text of every page of a PDF with pdfplumber.

    Duplicate characters are removed from each page (``dedupe_chars`` with
    ``tolerance=1``) before its text is extracted. Page texts are joined with
    a newline so the last line of one page is not fused with the first line of
    the next.

    Args:
        file_path: path of the PDF file to read.

    Returns:
        The text of all pages as one string ('' for a PDF without pages).
    """
    # TODO: add check for text vs image PDF
    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # Remove duplicate characters from the page
            deduped_page = page.dedupe_chars(tolerance=1)
            # Guard against a falsy result (None or '') for pages without a
            # text layer - NOTE(review): confirm for the pinned pdfplumber version.
            page_texts.append(deduped_page.extract_text() or '')
    return '\n'.join(page_texts)
def get_PDF_examples(directory):
    """List the PDF files found directly inside ``directory``.

    Entries are returned in sorted name order so the result does not depend
    on the arbitrary order of ``os.listdir``. The extension check ignores
    case, so '.PDF' files are included as well.

    Args:
        directory: path of the directory to scan (not recursive).

    Returns:
        List of paths (``directory`` joined with each file name).

    Raises:
        OSError: if ``directory`` cannot be listed (e.g. it does not exist).
    """
    return [
        os.path.join(directory, filename)
        for filename in sorted(os.listdir(directory))
        if filename.lower().endswith('.pdf')
    ]
# Signatures and Models
class FindInvoiceNumberColumns(dspy.Signature):
    """Given an input remittance letter, return a list of column header names that may contain invoice numbers."""

    # Input: the remittance letter text. The identity formatter is there so
    # that new lines are not skipped (per the original author's note).
    content = dspy.InputField(
        desc="remittance letter",
        format=lambda s: s,
    )
    # Output: header names as one comma-separated string.
    column_header_names = dspy.OutputField(
        desc="comma-separated list of column header names that may contain invoice numbers",
    )
class InvoiceColumnHeaders(dspy.Module):
    """
    Predict the column headers containing invoice numbers from the remittance letter.

    Attributes:
        response_parser: a function that takes a string and returns a list of strings.
    """

    def __init__(self, response_parser=parse_CSV_string):
        super().__init__()
        self.response_parser = response_parser
        self.potential_invoice_column_headers = dspy.Predict(FindInvoiceNumberColumns)

    def forward(self, file_content):
        """Return a Prediction whose ``column_header_names`` is a list of unique headers.

        Args:
            file_content: text of the remittance letter.
        """
        prediction = self.potential_invoice_column_headers(content=file_content)
        parsed_headers = self.response_parser(prediction.column_header_names)
        # Remove duplicates while keeping first-seen order: a set would return
        # the headers in an arbitrary order that can differ between runs.
        # Blank entries are skipped because they name no column.
        unique_headers = list(dict.fromkeys(header for header in parsed_headers if header))
        # Create a new Prediction object with the unique headers
        return Prediction(column_header_names=unique_headers)
class FindInvoiceList(dspy.Signature):
    """Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers that belong to that column."""

    # Input: the remittance letter text. The identity formatter is there so
    # that new lines are not skipped (per the original author's note).
    content = dspy.InputField(
        desc="remittance letter",
        format=lambda s: s,
    )
    # Input: the header of the column to read invoice numbers from.
    invoice_column_header = dspy.InputField(
        desc="invoice column header name",
    )
    # Output: invoice numbers as one comma-separated string.
    candidate_invoice_numbers = dspy.OutputField(
        desc="comma-separated list of invoice numbers",
    )
class InvoiceList(dspy.Module):
    """
    Collect candidate invoice-number lists from a remittance letter.

    First the candidate column headers are predicted, then the invoice numbers
    are requested once per header.

    Attributes:
        response_parser: A function that takes a string and returns a list of invoice numbers.

    Returns:
        A Prediction object with the following fields:
            candidate_invoice_numbers: A list of lists of invoice numbers.
    """

    def __init__(self, response_parser=parse_CSV_string_to_unique):
        super().__init__()
        self.response_parser = response_parser
        # here we could load a compiled program also
        self.find_invoice_headers = InvoiceColumnHeaders(response_parser=parse_column_names)
        self.find_invoice_numbers = dspy.Predict(FindInvoiceList)

    def forward(self, file_content):
        headers = self.find_invoice_headers(file_content=file_content).column_header_names
        # One parsed invoice-number list per candidate header.
        numbers_per_header = [
            self.response_parser(
                self.find_invoice_numbers(
                    content=file_content,
                    invoice_column_header=header,
                ).candidate_invoice_numbers
            )
            for header in headers
        ]
        # Different headers can yield the same invoice numbers; keep each list once.
        return Prediction(candidate_invoice_numbers=remove_duplicate_lists(numbers_per_header))
class FindTotalAmountColumns(dspy.Signature):
    """Given an input remittance letter, return a list of column header names that may contain the total payment amount."""

    # Input: the remittance letter text. The identity formatter is there so
    # that new lines are not skipped (per the original author's note).
    content = dspy.InputField(
        desc="remittance letter",
        format=lambda s: s,
    )
    # Output: header names as one comma-separated string.
    total_column_header_names = dspy.OutputField(
        desc="comma-separated list of column header names that may contain the remittance letter total payment amount",
    )
class TotalAmountColumnHeaders(dspy.Module):
    """Predict the column headers that may contain the total payment amount.

    The raw prediction of the FindTotalAmountColumns signature is returned
    unparsed.
    """

    def __init__(self):
        super().__init__()
        self.potential_total_amount_column_headers = dspy.Predict(FindTotalAmountColumns)

    def forward(self, file_content):
        return self.potential_total_amount_column_headers(content=file_content)
class FindTotalAmount(dspy.Signature):
    """Given an input remittance letter and a column header name output the total payment amount that belongs to that column."""

    # Input: the remittance letter text. The identity formatter is there so
    # that new lines are not skipped (per the original author's note).
    content = dspy.InputField(
        desc="remittance letter",
        format=lambda s: s,
    )
    # Input: the header of the column to read the total from.
    total_amount_column_header = dspy.InputField(
        desc="total amount header name",
    )
    # Output: the total payment amount as text.
    total_amount = dspy.OutputField(
        desc="total payment amount",
    )
class RemittanceLetterTotalAmount(dspy.Module):
    """
    Collect candidate total payment amounts from a remittance letter.

    First the candidate column headers are predicted, then the total amount is
    requested once per header.

    Returns:
        A Prediction object with the following fields:
            candidate_total_amounts: A list of unique total amounts (strings)
                in order of first appearance.
    """

    def __init__(self):
        super().__init__()
        self.find_total_amount_header = TotalAmountColumnHeaders()
        self.find_total_amount = dspy.Predict(FindTotalAmount)

    def forward(self, file_content):
        # Predict column headers (returns a Prediction with a CSV string in
        # "total_column_header_names")
        predict_column_headers = self.find_total_amount_header(file_content=file_content)
        # Parse CSV into a list
        potential_total_amount_column_headers = parse_CSV_string_to_unique(
            predict_column_headers.total_column_header_names
        )
        potential_total_amounts = [
            self.find_total_amount(
                content=file_content,
                total_amount_column_header=header,
            ).total_amount
            for header in potential_total_amount_column_headers
        ]
        # Remove duplicates while keeping first-seen order; a set would return
        # the amounts in an arbitrary order that can differ between runs.
        potential_total_amounts = list(dict.fromkeys(potential_total_amounts))
        return Prediction(candidate_total_amounts=potential_total_amounts)
# Pipeline
def poc_production_pipeline_without_verification(file_content):
    """Retrieve invoice and total-amount proposals from remittance letter text.

    Args:
        file_content: text of the remittance letter.

    Returns:
        A pair ``(invoice_rows, total_amount_rows)``. Each is a list of
        one-element tuples, i.e. one single-column row per proposal:
          - an invoice row holds one candidate list joined with ','
          - a total amount row holds one formatted, unique amount
    """
    # Get invoice candidates
    invoice_list_baseline = InvoiceList()
    candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
    candidate_invoices = [",".join(lst) for lst in candidate_invoices]
    # Get total amount candidates
    total_amount_baseline = RemittanceLetterTotalAmount()
    raw_total_amounts = total_amount_baseline(file_content=file_content).candidate_total_amounts
    # Format all decimals and only keep unique amounts. dict.fromkeys keeps
    # first-seen order, so the rows come out in the same order on every run
    # (a set would order them arbitrarily).
    candidate_total_amounts = list(dict.fromkeys(map(format_text_decimal, raw_total_amounts)))
    # For UI visualisation purposes, wrap every candidate in a one-element
    # tuple: each tuple is one row of a single-column table.
    candidate_invoices_for_UI = [(candidate,) for candidate in candidate_invoices]
    candidate_total_amounts_for_UI = [(candidate,) for candidate in candidate_total_amounts]
    return candidate_invoices_for_UI, candidate_total_amounts_for_UI
def poc_production_pipeline_without_verification_from_PDF(file_path):
    """Run the proposal pipeline on the text extracted from a PDF file.

    Args:
        file_path: path of the remittance letter PDF.

    Returns:
        Whatever poc_production_pipeline_without_verification returns for the
        extracted text.
    """
    return poc_production_pipeline_without_verification(
        extract_text_using_pdfplumber(file_path)
    )
# Main app
# Example PDFs collected from pdf_examples_dir and passed to the interface.
fake_PDF_examples = get_PDF_examples(pdf_examples_dir)
# Gradio interface: one PDF input, two single-column tables as output (invoice
# proposals and total amount proposals), matching the pair returned by the
# pipeline function.
remittance_letter_demo_without_verification_from_PDF = gr.Interface(
    poc_production_pipeline_without_verification_from_PDF,
    [PDF(label="Remittance advice", height=1000)],
    [
        gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved invoice proposals"], wrap=True),
        gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved total amount proposals"], wrap=True)
    ],
    examples=fake_PDF_examples,
    allow_flagging='never'
)
# Start the app (the stray trailing "|" after this call was a syntax error).
remittance_letter_demo_without_verification_from_PDF.launch()