File size: 4,282 Bytes
1961add 50d43ea 1961add 50d43ea 1961add 50d43ea 1961add 50d43ea 1961add 50d43ea 1961add 50d43ea 1961add 50d43ea 1961add 50d43ea 1961add 50d43ea |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import json
import pandas as pd
import os
from pypdf import PdfReader
import re
from tqdm import tqdm
from openai import AzureOpenAI, Client
from prompts import ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT
import ast
import time
# Lazily matches from the table header through the first "EUR" that follows
# "Gesamtsumme:"; DOTALL lets the match span multiple lines.
_INVOICE_TABLE_RE = re.compile(r"Beschädigtes Teil.*?Gesamtsumme:.*?EUR", re.DOTALL)
# A line break that is NOT preceded by a digit is replaced by a space, so only
# lines ending in a digit terminate a table row (presumably this re-joins part
# names that wrapped onto a second line -- confirm against real invoices).
_WRAPPED_LINE_RE = re.compile(r"(?<!\d)\n")
_INVOICE_TABLE_HEADER = (
    "Beschädigtes Teil | Teilkosten (EUR) | Arbeitsstunden | "
    "Arbeitskosten (EUR/Stunde) | Gesamtkosten (EUR)"
)


def _reformat_invoice_table(table_text: str) -> str:
    """Turn the raw table text of one invoice into a pipe-delimited table."""
    # The extracted header line is dropped and replaced by the fixed header.
    body = _WRAPPED_LINE_RE.sub(" ", "\n".join(table_text.splitlines()[1:]))
    # Everything but the last line is a data row; the last line is the total.
    *rows, total_line = body.split("\n")
    formatted_rows = []
    for row in rows:
        tokens = row.split(" ")
        # The last four space-separated tokens are the numeric columns;
        # whatever precedes them is the part name.
        formatted_rows.append(
            " ".join(tokens[:-4]) + " | " + " | ".join(tokens[-4:])
        )
    return (
        _INVOICE_TABLE_HEADER + "\n" + "\n".join(formatted_rows) + "\n" + total_line
    )


def extract_invoice_tables(invoices_path: str) -> list[str]:
    """Extract the cost table from the first page of every PDF in a folder.

    Args:
        invoices_path: Directory containing the invoice files. Only entries
            whose name ends with ".pdf" are read.

    Returns:
        One pipe-delimited table string per invoice: a fixed header line, one
        line per damaged part, and the total line as the last line. Invoices
        are processed in sorted path order so the result is deterministic.

    Raises:
        ValueError: If the first page of an invoice contains no cost table.
    """
    invoice_paths = sorted(
        os.path.join(invoices_path, name)
        for name in os.listdir(invoices_path)
        if name.endswith(".pdf")
    )
    if not invoice_paths:
        # Nothing to parse; skip the progress bar entirely.
        return []

    invoices_tables = []
    for invoice_path in tqdm(invoice_paths):
        # Only the first page is inspected.
        text = PdfReader(invoice_path).pages[0].extract_text()
        match = _INVOICE_TABLE_RE.search(text)
        if match is None:
            # Fail with the offending file name instead of an opaque
            # AttributeError on None.
            raise ValueError(
                f"No cost table found on the first page of {invoice_path}"
            )
        invoices_tables.append(_reformat_invoice_table(match.group()))
    return invoices_tables
def generate_response(
        client: Client,
        car_parts: list[str],
        model: str = "gpt-4o",
        max_tokens: int = 1024,
        temperature: float = 0.5,
) -> str:
    """Ask the chat model for cost estimates for a batch of car parts.

    The part names are joined with newlines, passed to
    ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT to build the prompt, and sent
    as a single user message.

    Args:
        client: Client used to call the chat completions endpoint.
        car_parts: Names of the car parts to include in the prompt.
        model: Model name passed to the API.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to the API.

    Returns:
        The text of the first choice with Markdown code fences removed.

    Raises:
        ValueError: If the first choice carries no text content.
    """
    prompt = ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT("\n".join(car_parts))
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    message = response.choices[0].message.content
    if message is None:
        # The content field is optional in the API response; report that
        # explicitly instead of failing on None.replace(...).
        raise ValueError("The model response contains no text content")
    # Strip the code fences so the caller can parse the bare literal.
    return message.replace("```python", "").replace("```", "")
def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
    """Collect (part name, second column, third column) from invoice tables.

    Each invoice is a newline-separated table whose first and last lines are
    skipped; the remaining lines are split on " | ". The side markers
    "(rechts)", "(links)", "links" and "rechts" are removed from the part name
    before surrounding whitespace is stripped. The second and third fields are
    returned unchanged as strings.
    """
    # Order matters: the parenthesised forms must go before the bare words.
    side_markers = ("(rechts)", "(links)", "links", "rechts")
    collected = []
    for table in invoices:
        for row in table.split("\n")[1:-1]:
            fields = row.split(" | ")
            part_name = fields[0]
            for marker in side_markers:
                part_name = part_name.replace(marker, "")
            collected.append((part_name.strip(), fields[1], fields[2]))
    return collected
def prompt_model_for_cost_estimation(
        parts_to_estimate: list[str],
        batch_size: int = 100,
        pause_seconds: float = 5,
) -> list[tuple[str, float, float]]:
    """Query the model for cost estimates, one batch of part names at a time.

    Each batch is sent through generate_response and the reply is parsed with
    ast.literal_eval. A batch that fails (request error, unparsable reply, or a
    reply that is not a list/tuple) is reported on stdout and skipped, so the
    result may cover fewer parts than were requested.

    Args:
        parts_to_estimate: Part names to estimate.
        batch_size: Number of part names sent per request; must be positive.
        pause_seconds: Delay between consecutive requests.

    Returns:
        The concatenated entries of every successfully parsed reply.

    Raises:
        ValueError: If batch_size is not positive.
        KeyError: If one of the AZURE_API_KEY, AZURE_API_VERSION or
            AZURE_ENDPOINT environment variables is missing and there is at
            least one part to estimate.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if not parts_to_estimate:
        # Nothing to ask: avoid requiring credentials and creating a client.
        return []

    client = AzureOpenAI(
        api_key=os.environ["AZURE_API_KEY"],
        api_version=os.environ["AZURE_API_VERSION"],
        azure_endpoint=os.environ["AZURE_ENDPOINT"],
    )
    car_parts = []
    batch_starts = range(0, len(parts_to_estimate), batch_size)
    for batch_index, start in enumerate(tqdm(batch_starts)):
        if batch_index:
            # Pause only between requests, never after the final one, and
            # also after a failed batch.
            time.sleep(pause_seconds)
        batch = parts_to_estimate[start:start + batch_size]
        try:
            response = generate_response(client, batch)
            # literal_eval only accepts Python literals, so model output is
            # never executed.
            parsed = ast.literal_eval(response)
            if not isinstance(parsed, (list, tuple)):
                # extend() would otherwise add dict keys or single characters.
                raise ValueError(
                    f"expected a list of estimates, got {type(parsed).__name__}"
                )
            car_parts.extend(parsed)
        except Exception as e:
            # Best effort: one bad batch must not discard the others.
            print(f"Error: {e}")
    return car_parts
def estimate_costs(
        invoices_folder_path: str,
        path_to_save_json: str,
        estimation_rounds: int = 3,
) -> pd.DataFrame:
    """Aggregate part costs and labour hours from invoices and model estimates.

    The parts found in the invoices are combined with the estimates returned
    by prompt_model_for_cost_estimation, which is called estimation_rounds
    times for the distinct part names. The combined rows are grouped by part
    name, summarised, and written as JSON.

    Args:
        invoices_folder_path: Folder passed to extract_invoice_tables.
        path_to_save_json: Destination of the JSON summary, keyed by part name.
            Missing parent directories are created.
        estimation_rounds: How many times the model is asked for estimates.

    Returns:
        A DataFrame indexed by part name with the columns average_cost,
        cost_min, cost_max, count, average_hours, hours_min and hours_max.
    """
    invoices = extract_invoice_tables(invoices_folder_path)
    car_parts = get_car_parts(invoices)

    # dict.fromkeys de-duplicates while keeping first-seen order.
    parts_to_estimate = list(dict.fromkeys(part for part, _, _ in car_parts))
    for _ in range(estimation_rounds):
        car_parts += prompt_model_for_cost_estimation(parts_to_estimate)

    car_parts_df = pd.DataFrame(car_parts, columns=["car_part", "cost", "hours"])
    # Invoice values arrive as strings; convert before aggregating.
    car_parts_df["cost"] = car_parts_df["cost"].astype(float)
    car_parts_df["hours"] = car_parts_df["hours"].astype(float)

    car_parts_df_grouped = car_parts_df.groupby("car_part").agg(
        average_cost=("cost", "mean"),
        cost_min=("cost", "min"),
        cost_max=("cost", "max"),
        count=("cost", "count"),
        average_hours=("hours", "mean"),
        hours_min=("hours", "min"),
        hours_max=("hours", "max"),
    )
    car_parts_dict = car_parts_df_grouped.to_dict(orient="index")

    # Create the target directory first so the summary is not lost to a
    # missing folder after the model calls have already been made.
    output_dir = os.path.dirname(path_to_save_json)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(path_to_save_json, "w", encoding="utf-8") as f:
        json.dump(car_parts_dict, f, ensure_ascii=False, indent=4)
    return car_parts_df_grouped
|