File size: 4,282 Bytes
1961add
 
 
 
 
 
50d43ea
 
 
 
1961add
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50d43ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1961add
 
 
 
 
50d43ea
1961add
 
 
 
 
 
 
 
50d43ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1961add
 
 
 
 
50d43ea
 
 
 
 
 
1961add
 
50d43ea
1961add
 
50d43ea
1961add
 
 
50d43ea
1961add
 
 
 
 
 
50d43ea
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import json
import pandas as pd
import os
from pypdf import PdfReader
import re
from tqdm import tqdm
from openai import AzureOpenAI, Client
from prompts import ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT
import ast
import time


def _reformat_invoice_table(page_text: str) -> str:
    """Convert the raw text of an invoice page into a pipe-delimited table.

    The table is located as the span from "Beschädigtes Teil" up to the first
    "EUR" that follows "Gesamtsumme:". Its first line is replaced by a fixed
    five-column header, every row between the first and the last line gets its
    last four space-separated tokens split off as separate columns, and the
    last line is kept verbatim.

    Args:
        page_text: Text extracted from one PDF page.

    Returns:
        The reformatted table as a single newline-separated string.

    Raises:
        ValueError: If the page text does not contain the expected table.
    """
    match = re.search(r"Beschädigtes Teil.*?Gesamtsumme:.*?EUR", page_text, re.DOTALL)
    if match is None:
        raise ValueError("no damaged-parts table found")

    header, *body_lines = match.group().splitlines()
    # Line breaks that do not follow a digit are folded into a space. This
    # presumably rejoins part names that wrapped onto the next line, since the
    # data rows are expected to end in a number.
    body = re.sub(r"(?<!\d)\n", " ", "\n".join(body_lines))
    rows = (header + "\n" + body).split("\n")

    formatted_rows = []
    for row in rows[1:-1]:
        tokens = row.split(" ")
        # Everything before the last four tokens is the part name; the last
        # four tokens are the numeric columns.
        formatted_rows.append(" ".join(tokens[:-4]) + " | " + " | ".join(tokens[-4:]))

    return (
        "Beschädigtes Teil | Teilkosten (EUR) | Arbeitsstunden | Arbeitskosten (EUR/Stunde) | Gesamtkosten (EUR)\n"
        + "\n".join(formatted_rows)
        + "\n"
        + rows[-1]
    )


def extract_invoice_tables(invoices_path: str) -> list[str]:
    """Extract the damaged-parts table from every PDF invoice in a folder.

    Only the first page of each ``.pdf`` file is read. Invoices that have no
    pages or whose first page does not contain the expected table are reported
    on stdout and skipped, so one malformed file does not abort the batch.

    Args:
        invoices_path: Directory containing the invoice PDFs.

    Returns:
        One pipe-delimited table string per successfully parsed invoice,
        ordered by file path.
    """
    # Sorted so the result does not depend on the file system's listing order.
    invoices_paths = sorted(
        os.path.join(invoices_path, invoice)
        for invoice in os.listdir(invoices_path)
        if invoice.endswith(".pdf")
    )
    if not invoices_paths:
        # Nothing to parse; avoid rendering an empty progress bar.
        return []

    invoices_tables = []
    for invoice_path in tqdm(invoices_paths):
        reader = PdfReader(invoice_path)
        if len(reader.pages) == 0:
            print(f"Error: {invoice_path}: PDF has no pages; skipping")
            continue

        text = reader.pages[0].extract_text() or ""
        try:
            invoices_tables.append(_reformat_invoice_table(text))
        except ValueError as e:
            print(f"Error: {invoice_path}: {e}; skipping")

    return invoices_tables


def generate_response(
        client: Client,
        car_parts: list[str],
        model: str = "gpt-4o",
        max_tokens: int = 1024,
        temperature: float = 0.5,
) -> str:
    """Ask the chat model for replacement-cost estimates of the given parts.

    The part names are joined with newlines and passed to
    ``ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT`` to build a single user
    message. Markdown code fences are removed from the reply so that the
    caller can parse it as a Python literal.

    Args:
        client: Client used to issue the chat completion request.
        car_parts: Names of the car parts to estimate.
        model: Model name sent with the request (presumably the deployment
            name when an Azure client is used; verify against the deployment).
        max_tokens: Upper bound on the number of tokens in the reply.
        temperature: Sampling temperature for the reply.

    Returns:
        The text of the first choice with "```python" and "```" removed.

    Raises:
        ValueError: If the first choice carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT("\n".join(car_parts))},
                ],
            }
        ],
        max_tokens=max_tokens,
        temperature=temperature,
    )

    message = response.choices[0].message.content
    if message is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("The model returned no text content")

    return message.replace("```python", "").replace("```", "")


def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
    """Collect (part name, cost, hours) triples from pipe-delimited invoices.

    The first and the last line of every invoice are ignored; each remaining
    line is split on " | " and its first three columns are used. Side markers
    ("(rechts)", "(links)", "rechts", "links") are removed from the part name
    and the remaining whitespace is collapsed, so that left and right variants
    of the same part end up under one name regardless of where the marker
    appeared.

    Args:
        invoices: Invoice tables as newline-separated, " | "-delimited strings.

    Returns:
        One ``(car_part, cost, hours)`` tuple per data row; cost and hours are
        returned as the raw column strings.

    Raises:
        IndexError: If a data row has fewer than three columns.
    """
    # Parenthesised markers must be removed before the bare words, otherwise
    # empty "()" pairs would be left behind.
    side_markers = ("(rechts)", "(links)", "links", "rechts")

    car_parts = []
    for invoice in invoices:
        for line in invoice.split("\n")[1:-1]:
            fields = line.split(" | ")
            name = fields[0]
            for marker in side_markers:
                name = name.replace(marker, "")
            # Collapsing whitespace also removes the gap a marker leaves
            # behind in the middle of a name.
            car_parts.append((" ".join(name.split()), fields[1], fields[2]))
    return car_parts


def prompt_model_for_cost_estimation(
        parts_to_estimate: list[str],
        batch_size: int = 100,
        pause_seconds: float = 5.0,
) -> list[tuple[str, float, float]]:
    """Query the model for cost estimates of the given parts, batch by batch.

    Each batch is sent through ``generate_response`` and the reply is parsed
    with ``ast.literal_eval``. A batch whose request or parsing fails is
    reported on stdout and skipped; the remaining batches are still processed.

    Args:
        parts_to_estimate: Names of the car parts to estimate.
        batch_size: Maximum number of part names sent per request.
        pause_seconds: Time to wait after every request, successful or not.

    Returns:
        The three-element entries collected from all parsed replies. Empty
        when ``parts_to_estimate`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If an Azure environment variable is missing (only when there
            is something to estimate).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if not parts_to_estimate:
        # No request to make, so no credentials or client are needed.
        return []

    client = AzureOpenAI(
        api_key=os.environ["AZURE_API_KEY"],
        api_version=os.environ["AZURE_API_VERSION"],
        azure_endpoint=os.environ["AZURE_ENDPOINT"],
    )

    car_parts = []
    for start in tqdm(range(0, len(parts_to_estimate), batch_size)):
        batch = parts_to_estimate[start:start + batch_size]
        try:
            response = generate_response(client, batch)
            # literal_eval only accepts Python literals, so the model output
            # cannot execute code.
            estimates = ast.literal_eval(response)
            if not isinstance(estimates, (list, tuple)):
                raise ValueError(
                    f"expected a list of estimates, got {type(estimates).__name__}"
                )
        except Exception as e:
            # Best effort: report the failed batch and carry on with the next.
            print(f"Error: {e}")
        else:
            # Keep only three-element rows; anything else would not fit the
            # (car_part, cost, hours) shape this function returns.
            valid = [
                tuple(entry)
                for entry in estimates
                if isinstance(entry, (list, tuple)) and len(entry) == 3
            ]
            if len(valid) != len(estimates):
                print(f"Error: dropped {len(estimates) - len(valid)} malformed estimate(s)")
            car_parts.extend(valid)

        # Pause after failures too, so a failing service is not hit again
        # immediately.
        time.sleep(pause_seconds)

    return car_parts


def estimate_costs(
        invoices_folder_path: str,
        path_to_save_json: str,
        estimation_rounds: int = 3,
) -> pd.DataFrame:
    """Build per-part cost and labour statistics and save them as JSON.

    Parts, costs and hours are read from the invoices in
    ``invoices_folder_path``. The distinct part names are then sent to the
    model ``estimation_rounds`` times and the returned estimates are pooled
    with the invoice data before aggregating per part.

    Args:
        invoices_folder_path: Directory containing the invoice PDFs.
        path_to_save_json: File the aggregated statistics are written to.
        estimation_rounds: How many times the model is queried for the same
            list of parts (presumably to smooth out variation between
            replies).

    Returns:
        A DataFrame indexed by ``car_part`` with the columns ``average_cost``,
        ``cost_min``, ``cost_max``, ``count``, ``average_hours``,
        ``hours_min`` and ``hours_max``. ``count`` covers invoice rows and
        model estimates together.
    """
    invoices = extract_invoice_tables(invoices_folder_path)
    car_parts = get_car_parts(invoices)

    # Distinct part names in order of first appearance.
    parts_to_estimate = list(dict.fromkeys(entry[0] for entry in car_parts))
    for _ in range(estimation_rounds):
        car_parts += prompt_model_for_cost_estimation(parts_to_estimate)

    car_parts_df = pd.DataFrame(car_parts, columns=["car_part", "cost", "hours"])

    # Invoice values arrive as strings; convert before aggregating.
    car_parts_df["cost"] = car_parts_df["cost"].astype(float)
    car_parts_df["hours"] = car_parts_df["hours"].astype(float)
    car_parts_df_grouped = car_parts_df.groupby("car_part").agg(
        {"cost": ["mean", "min", "max", "count"], "hours": ["mean", "min", "max"]}
    )
    # Flatten the two-level column index produced by the aggregation.
    car_parts_df_grouped.columns = [
        "average_cost", "cost_min", "cost_max", "count", "average_hours", "hours_min", "hours_max"
    ]

    car_parts_dict = car_parts_df_grouped.to_dict(orient="index")

    with open(path_to_save_json, "w", encoding="utf-8") as f:
        json.dump(car_parts_dict, f, ensure_ascii=False, indent=4)

    return car_parts_df_grouped