# pixtral-demo / cost_estimation / cost_estimation.py
# alexandraroze's picture
# updated cost estimation
# 50d43ea
# raw
# history blame
# 4.28 kB
import json
import pandas as pd
import os
from pypdf import PdfReader
import re
from tqdm import tqdm
from openai import AzureOpenAI, Client
from prompts import ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT
import ast
import time
def extract_invoice_tables(invoices_path: str) -> list[str]:
    """Extract the damaged-parts table from the first page of each PDF invoice.

    Every ``*.pdf`` file directly inside ``invoices_path`` is read (in sorted
    order, so the result is reproducible). The text between
    ``Beschädigtes Teil`` and the ``Gesamtsumme: ... EUR`` line is taken as the
    table and rewritten as pipe-separated rows under a fixed header.

    Args:
        invoices_path: Directory containing the PDF invoices.

    Returns:
        One string per invoice: the fixed header line, one
        ``name | v1 | v2 | v3 | v4`` line per part row, and the original
        total line as the last line.

    Raises:
        ValueError: If a PDF has no pages or its first page does not contain
            the expected table.
    """
    table_pattern = re.compile(r"Beschädigtes Teil.*?Gesamtsumme:.*?EUR", re.DOTALL)
    # A newline that does not follow a digit is treated as a wrapped row
    # (rows are expected to end in a number), so it is folded into a space.
    wrapped_newline = re.compile(r"(?<!\d)\n")
    header_row = "Beschädigtes Teil | Teilkosten (EUR) | Arbeitsstunden | Arbeitskosten (EUR/Stunde) | Gesamtkosten (EUR)"

    pdf_paths = sorted(
        os.path.join(invoices_path, file_name)
        for file_name in os.listdir(invoices_path)
        if file_name.endswith(".pdf")
    )

    invoices_tables = []
    for pdf_path in tqdm(pdf_paths):
        reader = PdfReader(pdf_path)
        if len(reader.pages) == 0:
            raise ValueError(f"Invoice has no pages: {pdf_path}")

        # extract_text() can yield nothing for image-only pages.
        text = reader.pages[0].extract_text() or ""
        match = table_pattern.search(text)
        if match is None:
            raise ValueError(
                f"No invoice table found on the first page of: {pdf_path}"
            )

        # The header line extracted from the PDF is dropped in favour of the
        # fixed header above; only the lines after it are kept.
        lines = match.group().splitlines()
        rows = wrapped_newline.sub(" ", "\n".join(lines[1:])).split("\n")

        # The last remaining line is the total; everything before it is a part
        # row whose final four space-separated tokens are the numeric columns.
        formatted = [header_row]
        for row in rows[:-1]:
            tokens = row.split(" ")
            formatted.append(" ".join(tokens[:-4]) + " | " + " | ".join(tokens[-4:]))
        formatted.append(rows[-1])

        invoices_tables.append("\n".join(formatted))
    return invoices_tables
def generate_response(client: Client, car_parts: list[str]):
    """Request cost estimates for ``car_parts`` and return the raw reply text.

    The part names are joined with newlines, passed to
    ``ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT`` to build the prompt, and
    sent as a single user message to the ``gpt-4o`` model. Markdown code-fence
    markers are removed from the reply before it is returned.

    Args:
        client: Client used to call the chat completions endpoint.
        car_parts: Names of the car parts to estimate.

    Returns:
        The content of the first choice with code-fence markers stripped.
    """
    prompt_text = ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT("\n".join(car_parts))
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": prompt_text}],
    }
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[user_message],
        max_tokens=1024,
        temperature=0.5,
    )
    reply = completion.choices[0].message.content
    # Strip the language-tagged fence first so no stray "python" is left behind.
    for fence in ("```python", "```"):
        reply = reply.replace(fence, "")
    return reply
def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
    """Collect ``(part name, cost, hours)`` triples from reformatted invoices.

    Each invoice is a newline-separated table whose first line is a header and
    whose last line is a total; both are skipped. The remaining lines are split
    on ``" | "`` and the first three fields are kept as strings.

    The side markers ``(links)``, ``(rechts)``, ``links`` and ``rechts`` are
    removed from the part name. Bare markers are only removed when they are
    whole words, so names that merely contain those letters (for example
    "Blinkschalter") are left intact.

    Args:
        invoices: Pipe-separated invoice tables.

    Returns:
        One tuple per part row, in invoice order.

    Raises:
        IndexError: If a part row has fewer than three ``" | "``-separated
            fields.
    """
    side_marker = re.compile(r"\((?:links|rechts)\)|\b(?:links|rechts)\b")

    car_parts = []
    for invoice in invoices:
        for line in invoice.split("\n")[1:-1]:
            fields = line.split(" | ")
            part_name = side_marker.sub("", fields[0]).strip()
            car_parts.append((part_name, fields[1], fields[2]))
    return car_parts
def prompt_model_for_cost_estimation(
    parts_to_estimate: list[str],
    batch_size: int = 100,
    pause_seconds: float = 5.0,
) -> list[tuple[str, float, float]]:
    """Ask the model for cost/hour estimates of the given parts, in batches.

    The parts are sent ``batch_size`` at a time through ``generate_response``
    and each reply is parsed with ``ast.literal_eval``. The call is
    best-effort: a batch that fails to send or parse is reported on stdout and
    skipped, and rows that are not 3-element sequences are dropped so that a
    single malformed row cannot break later tabulation.

    The Azure client is configured from the ``AZURE_API_KEY``,
    ``AZURE_API_VERSION`` and ``AZURE_ENDPOINT`` environment variables; they
    are only read when there is at least one part to estimate.

    Args:
        parts_to_estimate: Part names to send to the model.
        batch_size: Maximum number of parts per request; must be positive.
        pause_seconds: Delay inserted between consecutive requests.

    Returns:
        The 3-element rows parsed from all successful batches, as tuples.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        KeyError: If a required environment variable is missing.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if not parts_to_estimate:
        # Nothing to ask: avoid requiring credentials or building a client.
        return []

    client = AzureOpenAI(
        api_key=os.environ["AZURE_API_KEY"],
        api_version=os.environ["AZURE_API_VERSION"],
        azure_endpoint=os.environ["AZURE_ENDPOINT"],
    )

    estimates = []
    for start in tqdm(range(0, len(parts_to_estimate), batch_size)):
        if start:
            # Pause between requests (also after a failed one), never after the last.
            time.sleep(pause_seconds)

        batch = parts_to_estimate[start:start + batch_size]
        try:
            parsed = ast.literal_eval(generate_response(client, batch))
        except Exception as e:
            # Deliberately broad: any request or parse failure skips this batch.
            print(f"Error: {e}")
            continue

        if not isinstance(parsed, (list, tuple)):
            print(f"Error: expected a list of estimates, got {type(parsed).__name__}")
            continue

        for row in parsed:
            if isinstance(row, (list, tuple)) and len(row) == 3:
                estimates.append(tuple(row))
            else:
                print(f"Error: skipping malformed estimate {row!r}")
    return estimates
def estimate_costs(invoices_folder_path: str, path_to_save_json: str) -> pd.DataFrame:
    """Build per-part cost and labour statistics and save them as JSON.

    Part rows are read from the PDF invoices in ``invoices_folder_path``. Each
    unique part name is then sent for model estimation three times, and those
    estimates are pooled with the invoice rows before aggregating per part.

    Args:
        invoices_folder_path: Directory containing the PDF invoices.
        path_to_save_json: Destination file for the per-part statistics.

    Returns:
        A DataFrame indexed by part name with the columns ``average_cost``,
        ``cost_min``, ``cost_max``, ``count``, ``average_hours``,
        ``hours_min`` and ``hours_max``.
    """
    column_names = ["car_part", "cost", "hours"]

    records = get_car_parts(extract_invoice_tables(invoices_folder_path))
    unique_parts = pd.DataFrame(records, columns=column_names)["car_part"].unique().tolist()

    # Three independent estimation rounds are pooled with the invoice rows.
    for _ in range(3):
        records += prompt_model_for_cost_estimation(unique_parts)

    observations = pd.DataFrame(records, columns=column_names)
    for numeric_column in ("cost", "hours"):
        observations[numeric_column] = observations[numeric_column].astype(float)

    aggregations = {
        "cost": ["mean", "min", "max", "count"],
        "hours": ["mean", "min", "max"],
    }
    summary = observations.groupby("car_part").agg(aggregations)
    # Flatten the two-level column index into descriptive names.
    summary.columns = [
        "average_cost",
        "cost_min",
        "cost_max",
        "count",
        "average_hours",
        "hours_min",
        "hours_max",
    ]

    stats_by_part = summary.to_dict(orient="index")
    with open(path_to_save_json, "w", encoding="utf-8") as out_file:
        json.dump(stats_by_part, out_file, ensure_ascii=False, indent=4)
    return summary