|
import json |
|
import pandas as pd |
|
import os |
|
from pypdf import PdfReader |
|
import re |
|
from tqdm import tqdm |
|
from openai import AzureOpenAI, Client |
|
from prompts import ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT |
|
import ast |
|
import time |
|
|
|
|
|
def _reformat_invoice_table(table_text: str) -> str:
    """Rewrite one raw invoice table as pipe-separated rows under a fixed header."""
    header_row = (
        "Beschädigtes Teil | Teilkosten (EUR) | Arbeitsstunden | "
        "Arbeitskosten (EUR/Stunde) | Gesamtkosten (EUR)"
    )

    # The first extracted line (the original header) is dropped and replaced
    # by the fixed header above.
    raw_lines = table_text.splitlines()
    body_text = "\n".join(raw_lines[1:])

    # A line break that does not follow a digit is replaced by a space, so
    # only lines ending in a digit terminate a row.
    body_lines = re.sub(r"(?<!\d)\n", " ", body_text).split("\n")
    *part_rows, total_line = body_lines

    formatted_rows = []
    for row in part_rows:
        tokens = row.split(" ")
        # The last four space-separated tokens are the numeric columns;
        # everything before them is the part name.
        formatted_rows.append(" ".join(tokens[:-4]) + " | " + " | ".join(tokens[-4:]))

    return header_row + "\n" + "\n".join(formatted_rows) + "\n" + total_line


def extract_invoice_tables(invoices_path: str) -> list[str]:
    """Extract the damaged-parts table from the first page of each PDF invoice.

    Every file directly inside *invoices_path* whose name ends with ".pdf" is
    opened with ``PdfReader``. The text from "Beschädigtes Teil" up to the
    first following "Gesamtsumme: ... EUR" is taken as the table and rewritten
    as pipe-separated rows under a fixed five-column header, followed by the
    unchanged total line.

    Args:
        invoices_path: Directory containing the invoice PDF files.

    Returns:
        One reformatted table string per invoice in which the table was
        found, in sorted file-name order. Invoices whose first page does not
        contain the table are reported on stdout and skipped instead of
        aborting the whole run.
    """
    table_pattern = re.compile(r"Beschädigtes Teil.*?Gesamtsumme:.*?EUR", re.DOTALL)

    # os.listdir returns entries in arbitrary order; sorting makes the output
    # reproducible between runs and machines.
    pdf_paths = [
        os.path.join(invoices_path, file_name)
        for file_name in sorted(os.listdir(invoices_path))
        if file_name.endswith(".pdf")
    ]

    invoices_tables = []
    for pdf_path in tqdm(pdf_paths):
        page_text = PdfReader(pdf_path).pages[0].extract_text()

        table_match = table_pattern.search(page_text)
        if table_match is None:
            # re.search returns None when nothing matches; calling .group()
            # on it would raise an AttributeError and lose all other invoices.
            print(f"Warning: no parts table found in {pdf_path}, skipping")
            continue

        invoices_tables.append(_reformat_invoice_table(table_match.group()))

    return invoices_tables
|
|
|
|
|
def generate_response(client: Client, car_parts: list[str]):
    """Send one cost-estimation request to the chat model and return its reply text.

    The part names are joined with newlines and passed to
    ``ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT`` to build the text of a
    single user message.

    Args:
        client: Client object used to call ``chat.completions.create``.
        car_parts: Names of the car parts to include in the prompt.

    Returns:
        The content of the first choice with the Markdown fence markers
        "```python" and "```" removed.
    """
    prompt_text = ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT("\n".join(car_parts))
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": prompt_text}],
    }

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[user_message],
        max_tokens=1024,
        temperature=0.5,
    )

    reply = completion.choices[0].message.content
    # Strip the longer fence first so that "```python" does not leave a
    # dangling "python" behind.
    for fence in ("```python", "```"):
        reply = reply.replace(fence, "")
    return reply
|
|
|
|
|
def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
    """Collect (part name, cost, hours) triples from reformatted invoice tables.

    The first and last line of every invoice string are ignored; every other
    non-blank line is split on " | " and its first three fields are used.
    The side markers "(rechts)", "(links)", "links" and "rechts" are removed
    from the part name, and the remaining whitespace is collapsed so that
    removing a marker from the middle of a name does not leave a double space
    (which would make otherwise identical parts look different).

    Args:
        invoices: Invoice tables with " | "-separated rows, one string each.

    Returns:
        A list of ``(car_part, cost, hours)`` tuples; cost and hours are kept
        as the strings found in the table.

    Raises:
        IndexError: If a non-blank row has fewer than three " | " fields.
    """
    side_markers = ("(rechts)", "(links)", "links", "rechts")

    car_parts = []
    for invoice in invoices:
        for line in invoice.split("\n")[1:-1]:
            if not line.strip():
                # A blank row carries no part information.
                continue

            fields = line.split(" | ")
            part_name = fields[0]
            for marker in side_markers:
                part_name = part_name.replace(marker, "")
            # Collapses inner runs of whitespace and trims both ends.
            part_name = " ".join(part_name.split())

            car_parts.append((part_name, fields[1], fields[2]))
    return car_parts
|
|
|
|
|
def _well_formed_estimates(parsed: object) -> list[tuple]:
    """Return the three-element entries of a parsed model reply as tuples."""
    if not isinstance(parsed, (list, tuple)):
        # Extending a list with e.g. a string would silently add single
        # characters, so anything that is not a sequence is rejected.
        raise ValueError(
            f"expected a list of estimates, got {type(parsed).__name__}"
        )

    estimates = []
    for entry in parsed:
        if isinstance(entry, (list, tuple)) and len(entry) == 3:
            estimates.append(tuple(entry))
        else:
            print(f"Error: skipping malformed estimate {entry!r}")
    return estimates


def prompt_model_for_cost_estimation(
    parts_to_estimate: list[str],
    batch_size: int = 100
) -> list[tuple[str, float, float]]:
    """Ask the model for estimates for the given parts, batch by batch.

    The parts are sent to ``generate_response`` in slices of *batch_size*.
    Each reply is parsed with ``ast.literal_eval``; only entries that are
    three-element sequences are kept. A batch that fails (request error,
    unparsable reply, wrong reply shape) is reported on stdout and skipped,
    so the result may cover fewer parts than were requested.

    Args:
        parts_to_estimate: Names of the parts to send to the model.
        batch_size: Number of part names per request.

    Returns:
        The collected three-element tuples from all successful batches.
        An empty input returns an empty list without creating a client.

    Raises:
        KeyError: If AZURE_API_KEY, AZURE_API_VERSION or AZURE_ENDPOINT is
            missing from the environment (only when there is work to do).
    """
    if not parts_to_estimate:
        # Nothing to ask: do not require credentials or open a client.
        return []

    client = AzureOpenAI(
        api_key=os.environ["AZURE_API_KEY"],
        api_version=os.environ["AZURE_API_VERSION"],
        azure_endpoint=os.environ["AZURE_ENDPOINT"],
    )

    total_parts = len(parts_to_estimate)
    car_parts = []
    for start in tqdm(range(0, total_parts, batch_size)):
        batch = parts_to_estimate[start:start + batch_size]
        try:
            response = generate_response(client, batch)
            car_parts.extend(_well_formed_estimates(ast.literal_eval(response)))
        except Exception as e:
            # Deliberate best effort: one bad batch must not discard the
            # estimates already collected from the others.
            print(f"Error: {e}")

        # Pause between requests, also after a failed one, but not after the
        # final batch where there is nothing left to wait for.
        if start + batch_size < total_parts:
            time.sleep(5)

    return car_parts
|
|
|
|
|
def estimate_costs(
    invoices_folder_path: str,
    path_to_save_json: str,
    estimation_rounds: int = 3,
) -> pd.DataFrame:
    """Aggregate part costs and hours from invoices and model replies.

    The parts found in the invoices are combined with the tuples returned by
    ``prompt_model_for_cost_estimation``, which is called *estimation_rounds*
    times for the unique part names. All rows are then grouped by part name.

    Args:
        invoices_folder_path: Directory passed to ``extract_invoice_tables``.
        path_to_save_json: File that receives the aggregated values as JSON,
            keyed by part name.
        estimation_rounds: How many times the model is queried for the unique
            part names; defaults to 3.

    Returns:
        A DataFrame indexed by ``car_part`` with the columns ``average_cost``,
        ``cost_min``, ``cost_max``, ``count``, ``average_hours``,
        ``hours_min`` and ``hours_max``.
    """
    column_names = ["car_part", "cost", "hours"]

    invoices = extract_invoice_tables(invoices_folder_path)
    car_parts = get_car_parts(invoices)

    # Only the invoice rows are needed to decide which parts to ask about.
    invoice_parts_df = pd.DataFrame(car_parts, columns=column_names)
    parts_to_estimate = invoice_parts_df["car_part"].unique().tolist()
    for _ in range(estimation_rounds):
        car_parts += prompt_model_for_cost_estimation(parts_to_estimate)

    car_parts_df = pd.DataFrame(car_parts, columns=column_names)
    car_parts_df["cost"] = car_parts_df["cost"].astype(float)
    car_parts_df["hours"] = car_parts_df["hours"].astype(float)

    # Named aggregation ties every output column to its source column and
    # function, instead of relying on the positional order of a renamed
    # MultiIndex.
    car_parts_df_grouped = car_parts_df.groupby("car_part").agg(
        average_cost=("cost", "mean"),
        cost_min=("cost", "min"),
        cost_max=("cost", "max"),
        count=("cost", "count"),
        average_hours=("hours", "mean"),
        hours_min=("hours", "min"),
        hours_max=("hours", "max"),
    )

    car_parts_dict = car_parts_df_grouped.to_dict(orient="index")
    with open(path_to_save_json, "w", encoding="utf-8") as f:
        json.dump(car_parts_dict, f, ensure_ascii=False, indent=4)

    return car_parts_df_grouped
|
|