"""Estimate replacement cost and labour hours of car parts.

Part rows are read from the first page of PDF invoices, complemented with
estimates requested from an Azure OpenAI deployment, aggregated per part and
written to a JSON file.
"""

import ast
import json
import os
import re
import time

import pandas as pd
from openai import AzureOpenAI, Client
from pypdf import PdfReader
from tqdm import tqdm

from prompts import ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT

# Spans from the table header cell to the first "EUR" after the total line.
_INVOICE_TABLE_PATTERN = re.compile(
    r"Beschädigtes Teil.*?Gesamtsumme:.*?EUR", re.DOTALL
)
_FIELD_SEPARATOR = " | "
# Removed from part names in exactly this order so that the left and right
# variants of a part are aggregated under one name.
_SIDE_MARKERS = ("(rechts)", "(links)", "links", "rechts")

# NOTE(review): `generate_response(client, batch)` is called below but its
# definition is not part of the reviewed text (it appears to have been lost
# together with the truncated span in `extract_invoice_tables`). It must be
# restored from version control; `Client` and
# ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT are presumably used by it.


def extract_invoice_tables(invoices_path: str) -> list[str]:
    """Extract the damaged-parts table text from every PDF invoice in a folder.

    Only the first page of each ``*.pdf`` file is read.

    Args:
        invoices_path: Directory containing the PDF invoices.

    Returns:
        One string per invoice: the header line followed by the remaining
        table lines, separated by newlines.

    Raises:
        ValueError: If the first page of an invoice contains no table.
    """
    # Sorted so that the result does not depend on directory listing order.
    pdf_paths = sorted(
        os.path.join(invoices_path, file_name)
        for file_name in os.listdir(invoices_path)
        if file_name.endswith(".pdf")
    )

    invoices_tables = []
    for pdf_path in tqdm(pdf_paths):
        first_page = PdfReader(pdf_path).pages[0]
        text = first_page.extract_text()

        match = _INVOICE_TABLE_PATTERN.search(text)
        if match is None:
            # Fail with the offending file instead of an opaque
            # "'NoneType' object has no attribute 'group'".
            raise ValueError(
                f"No invoice table found on the first page of {pdf_path!r}"
            )

        lines = match.group().splitlines()
        header = lines[0]
        other_text = "\n".join(lines[1:])

        # NOTE(review): the reviewed text is truncated right here, in the
        # middle of `cleaned_text = re.sub(r"(?...`. The cleanup pattern and
        # the remainder of this function could not be recovered, so the rows
        # are passed through unmodified. Restore the original cleanup step
        # from version control before relying on this function.
        cleaned_text = other_text

        invoices_tables.append(f"{header}\n{cleaned_text}")

    return invoices_tables


def _strip_side_markers(part_name: str) -> str:
    """Remove the left/right markers from a part name and trim whitespace."""
    for marker in _SIDE_MARKERS:
        part_name = part_name.replace(marker, "")
    return part_name.strip()


# NOTE(review): the `def` line of this function was lost in the reviewed text;
# the signature is rebuilt from the surviving return annotation and call site.
def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
    """Parse the part rows out of the extracted invoice tables.

    The first line (header) and the last line (total) of every table are
    skipped. Each remaining row is split on ``" | "``; the first three fields
    are taken as part name, cost and hours.

    Args:
        invoices: Table strings as returned by ``extract_invoice_tables``.

    Returns:
        ``(car_part, cost, hours)`` tuples; cost and hours stay strings.

    Raises:
        ValueError: If a non-blank row has fewer than three fields.
    """
    car_parts = []
    for invoice in invoices:
        for row in invoice.split("\n")[1:-1]:
            if not row.strip():
                continue
            fields = row.split(_FIELD_SEPARATOR)
            if len(fields) < 3:
                raise ValueError(f"Malformed invoice row: {row!r}")
            car_parts.append(
                (_strip_side_markers(fields[0]), fields[1], fields[2])
            )
    return car_parts


def _parse_estimates(response: str) -> list[tuple]:
    """Turn a model response into a list of three-element tuples.

    The response is parsed with ``ast.literal_eval``. Entries that are not
    sequences of exactly three items are dropped, because a single malformed
    entry would otherwise make the later DataFrame construction fail.
    """
    parsed = ast.literal_eval(response)
    if not isinstance(parsed, (list, tuple)):
        raise ValueError(
            f"expected a list of estimates, got {type(parsed).__name__}"
        )

    estimates = [
        tuple(entry)
        for entry in parsed
        if isinstance(entry, (list, tuple)) and len(entry) == 3
    ]
    dropped = len(parsed) - len(estimates)
    if dropped:
        print(f"Error: dropped {dropped} malformed estimate(s)")
    return estimates


def prompt_model_for_cost_estimation(
    parts_to_estimate: list[str],
    batch_size: int = 100,
    delay_seconds: float = 5.0,
) -> list[tuple[str, float, float]]:
    """Ask the model for cost and hour estimates, one batch of parts at a time.

    A batch that fails (request error or unparsable response) is reported on
    stdout and skipped; the remaining batches are still processed.

    Args:
        parts_to_estimate: Part names to estimate.
        batch_size: Number of part names sent per request.
        delay_seconds: Pause after every batch, successful or not.

    Returns:
        The estimates collected from all successful batches.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If one of the ``AZURE_*`` environment variables is missing.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if not parts_to_estimate:
        # Nothing to ask for: avoid requiring credentials for a no-op.
        return []

    client = AzureOpenAI(
        api_key=os.environ["AZURE_API_KEY"],
        api_version=os.environ["AZURE_API_VERSION"],
        azure_endpoint=os.environ["AZURE_ENDPOINT"],
    )

    car_parts = []
    for start in tqdm(range(0, len(parts_to_estimate), batch_size)):
        batch = parts_to_estimate[start:start + batch_size]
        try:
            response = generate_response(client, batch)
            car_parts.extend(_parse_estimates(response))
        except Exception as e:
            # Deliberately best-effort: one bad batch must not abort the run.
            print(f"Error: {e}")
        # Throttle after failures as well, so that an error (for example a
        # rate limit) is not followed by an immediate new request.
        time.sleep(delay_seconds)
    return car_parts


def estimate_costs(
    invoices_folder_path: str,
    path_to_save_json: str,
    estimation_rounds: int = 3,
) -> pd.DataFrame:
    """Aggregate invoice and model estimates per car part and save them as JSON.

    Args:
        invoices_folder_path: Directory containing the PDF invoices.
        path_to_save_json: Destination of the JSON file with the statistics.
        estimation_rounds: How many times the model is asked to estimate the
            full list of distinct parts.

    Returns:
        A DataFrame indexed by part name with the columns ``average_cost``,
        ``cost_min``, ``cost_max``, ``count``, ``average_hours``,
        ``hours_min`` and ``hours_max``.

    Raises:
        ValueError: If a cost or hours value cannot be converted to float.
    """
    invoices = extract_invoice_tables(invoices_folder_path)
    car_parts = get_car_parts(invoices)

    # Distinct part names in order of first appearance.
    parts_to_estimate = list(dict.fromkeys(part[0] for part in car_parts))

    for _ in range(estimation_rounds):
        car_parts.extend(prompt_model_for_cost_estimation(parts_to_estimate))

    car_parts_df = pd.DataFrame(car_parts, columns=["car_part", "cost", "hours"])
    car_parts_df = car_parts_df.astype({"cost": float, "hours": float})

    # Named aggregation ties every output column to its source explicitly
    # instead of renaming the aggregated columns by position afterwards.
    car_parts_df_grouped = car_parts_df.groupby("car_part").agg(
        average_cost=("cost", "mean"),
        cost_min=("cost", "min"),
        cost_max=("cost", "max"),
        count=("cost", "count"),
        average_hours=("hours", "mean"),
        hours_min=("hours", "min"),
        hours_max=("hours", "max"),
    )

    car_parts_dict = car_parts_df_grouped.to_dict(orient="index")
    with open(path_to_save_json, "w", encoding="utf-8") as f:
        json.dump(car_parts_dict, f, ensure_ascii=False, indent=4)

    return car_parts_df_grouped