Commit
·
1961add
1
Parent(s):
c4c635a
added cost_estimation
Browse files
cost_estimation/cost_estimation.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import pandas as pd
|
3 |
+
import os
|
4 |
+
from pypdf import PdfReader
|
5 |
+
import re
|
6 |
+
from tqdm import tqdm
|
7 |
+
|
8 |
+
|
9 |
+
def extract_invoice_tables(invoices_path: str) -> list[str]:
    """Extract the damage table from the first page of every PDF invoice in a folder.

    For each file in ``invoices_path`` whose name ends with ``.pdf`` the text of
    the first page is read, the span from "Beschädigtes Teil" up to the first
    "EUR" that follows "Gesamtsumme:" is isolated, and that span is rewritten as
    `` | ``-separated rows under a fixed five-column header.

    Args:
        invoices_path: Directory that contains the PDF invoices.

    Returns:
        One reformatted table string per invoice, ordered by sorted file path.
        Each string consists of the fixed header line, one line per table row,
        and finally the unchanged "Gesamtsumme" line.

    Raises:
        ValueError: If the first page of an invoice does not contain the
            expected table span.
    """
    # Compiled once: the pattern is the same for every invoice.
    table_pattern = re.compile(r"Beschädigtes Teil.*?Gesamtsumme:.*?EUR", re.DOTALL)

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # result reproducible between runs and machines.
    pdf_paths = sorted(
        os.path.join(invoices_path, file_name)
        for file_name in os.listdir(invoices_path)
        if file_name.endswith(".pdf")
    )

    invoices_tables = []
    for invoice_path in tqdm(pdf_paths):
        # Only the first page is inspected.
        first_page_text = PdfReader(invoice_path).pages[0].extract_text()

        table_match = table_pattern.search(first_page_text)
        if table_match is None:
            # Fail with the offending file name instead of an opaque
            # AttributeError on None.
            raise ValueError(
                f"No damage table found on the first page of {invoice_path!r}"
            )

        invoices_tables.append(_reformat_invoice_table(table_match.group()))

    return invoices_tables


def _reformat_invoice_table(table_text: str) -> str:
    """Rewrite the raw table text as `` | ``-separated rows under a fixed header."""
    column_header = (
        "Beschädigtes Teil | Teilkosten (EUR) | Arbeitsstunden | "
        "Arbeitskosten (EUR/Stunde) | Gesamtkosten (EUR)"
    )

    # The first extracted line is the original header; it is replaced by
    # ``column_header`` and therefore dropped here.
    body_text = "\n".join(table_text.splitlines()[1:])

    # A newline that does not follow a digit is replaced by a space, so a row
    # whose text was broken across several lines becomes one line again.
    merged_text = re.sub(r"(?<!\d)\n", " ", body_text)

    # Everything except the final line is a table row; the final line is
    # passed through unchanged.
    *row_lines, total_line = merged_text.split("\n")

    formatted_rows = []
    for row_line in row_lines:
        tokens = row_line.split(" ")
        # The last four space-separated tokens become columns 2-5; whatever
        # precedes them is kept together as the first column.
        formatted_rows.append(" ".join(tokens[:-4]) + " | " + " | ".join(tokens[-4:]))

    return column_header + "\n" + "\n".join(formatted_rows) + "\n" + total_line
|
35 |
+
|
36 |
+
|
37 |
+
def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
    """Collect (part name, cost, hours) triples from reformatted invoice tables.

    The first and last line of every invoice string are ignored; each
    remaining non-blank line is split on `` | `` and its first three fields
    are used.

    Args:
        invoices: Invoice tables as newline-separated strings whose rows use
            `` | `` as the column separator.

    Returns:
        One tuple per row, in input order: the first field with the markers
        "(rechts)" and "(links)" removed and surrounding whitespace stripped,
        followed by the second and third fields as unparsed strings.

    Raises:
        IndexError: If a non-blank row has fewer than three `` | ``-separated
            fields.
    """
    car_parts = []
    for invoice in invoices:
        for line in invoice.split("\n")[1:-1]:
            # A table without any item rows leaves an empty line between the
            # header and the total line; there is nothing to extract from it.
            if not line.strip():
                continue

            fields = line.split(" | ")
            part_name = fields[0].replace("(rechts)", "").replace("(links)", "").strip()
            car_parts.append((part_name, fields[1], fields[2]))
    return car_parts
|
49 |
+
|
50 |
+
|
51 |
+
def estimate_costs(invoices_folder_path: str, path_to_save_json: str) -> pd.DataFrame:
    """Compute per-part cost and hour statistics from a folder of PDF invoices.

    The invoices are parsed with ``extract_invoice_tables`` and flattened with
    ``get_car_parts``. Cost and hours are converted to floats and grouped by
    part name. The resulting statistics are written to a JSON file keyed by
    part name and also returned.

    Args:
        invoices_folder_path: Directory containing the PDF invoices.
        path_to_save_json: Destination path of the JSON file; it is opened for
            writing with UTF-8 encoding.

    Returns:
        A DataFrame indexed by ``car_part`` with the columns ``average_cost``,
        ``cost_min``, ``cost_max``, ``count``, ``average_hours``,
        ``hours_min`` and ``hours_max``.
    """
    part_rows = get_car_parts(extract_invoice_tables(invoices_folder_path))

    parts_frame = pd.DataFrame(part_rows, columns=["car_part", "cost", "hours"])
    # Both numeric columns arrive as strings and must be floats to aggregate.
    parts_frame = parts_frame.astype({"cost": float, "hours": float})

    aggregations = {
        "cost": ["mean", "min", "max", "count"],
        "hours": ["mean", "min", "max"],
    }
    stats = parts_frame.groupby("car_part").agg(aggregations)
    # Flatten the two-level column index; the names follow the order of the
    # aggregation spec above.
    stats.columns = [
        "average_cost",
        "cost_min",
        "cost_max",
        "count",
        "average_hours",
        "hours_min",
        "hours_max",
    ]

    stats_by_part = stats.to_dict(orient="index")
    with open(path_to_save_json, "w", encoding="utf-8") as json_file:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(stats_by_part, json_file, ensure_ascii=False, indent=4)

    return stats
|