alexandraroze committed on
Commit
1961add
·
1 Parent(s): c4c635a

added cost_estimation

Browse files
Files changed (1) hide show
  1. cost_estimation/cost_estimation.py +72 -0
cost_estimation/cost_estimation.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+ import os
4
+ from pypdf import PdfReader
5
+ import re
6
+ from tqdm import tqdm
7
+
8
+
9
def extract_invoice_tables(invoices_path: str) -> list[str]:
    """Extract the damaged-parts table from every PDF invoice in a folder.

    For each ``*.pdf`` file directly inside ``invoices_path`` the text of the
    first page is extracted, the span from ``"Beschädigtes Teil"`` up to the
    first ``"Gesamtsumme: ... EUR"`` is located, and that span is rewritten
    as a pipe-separated table:

    * line 1 is a fixed five-column header,
    * every following line except the last is one table row whose last four
      space-separated tokens become the four numeric columns and whose
      remaining leading tokens form the part name,
    * the last line is the total ("Gesamtsumme") line, copied unchanged.

    Args:
        invoices_path: Directory containing the invoice PDF files.

    Returns:
        One reformatted table string per PDF, ordered by file path.

    Raises:
        ValueError: If the first page of an invoice does not contain the
            expected table (the message names the offending file).
    """
    output_header = (
        "Beschädigtes Teil | Teilkosten (EUR) | Arbeitsstunden | "
        "Arbeitskosten (EUR/Stunde) | Gesamtkosten (EUR)"
    )
    # Compiled once: the same pattern is applied to every invoice.
    table_pattern = re.compile(r"Beschädigtes Teil.*?Gesamtsumme:.*?EUR", re.DOTALL)

    # os.listdir returns entries in arbitrary order; sorting makes the
    # result reproducible between runs and machines.
    invoices_paths = sorted(
        os.path.join(invoices_path, invoice)
        for invoice in os.listdir(invoices_path)
        if invoice.endswith(".pdf")
    )

    invoices_tables = []
    for invoice_path in tqdm(invoices_paths):
        reader = PdfReader(invoice_path)
        text = reader.pages[0].extract_text()

        match = table_pattern.search(text)
        if match is None:
            # Without this check the failure would surface as an opaque
            # "'NoneType' object has no attribute 'group'" with no hint
            # about which file is malformed.
            raise ValueError(
                f"Could not find the damaged-parts table on the first page of {invoice_path!r}"
            )

        lines = match.group().splitlines()
        other_text = "\n".join(lines[1:])
        # A table row ends with a number, so a line break that is NOT
        # preceded by a digit is a wrap inside a part name: join it back.
        cleaned_text = re.sub(r"(?<!\d)\n", " ", other_text)

        # The extracted header line (lines[0]) is dropped in favour of the
        # fixed output header; the last remaining line is the total line.
        body_lines = cleaned_text.split("\n")
        row_lines, total_line = body_lines[:-1], body_lines[-1]

        formatted_rows = []
        for row in row_lines:
            tokens = row.split(" ")
            # Last four tokens are the numeric columns; everything before
            # them is the (possibly multi-word) part name.
            formatted_rows.append(" ".join(tokens[:-4]) + " | " + " | ".join(tokens[-4:]))

        invoices_tables.append(
            output_header + "\n" + "\n".join(formatted_rows) + "\n" + total_line
        )

    return invoices_tables
35
+
36
+
37
def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
    """Collect (part name, second column, third column) from invoice tables.

    Each invoice is a newline-separated table whose first line (header) and
    last line (total) are ignored. Every line in between is split on
    ``" | "``; the first field has the side markers ``"(rechts)"`` and
    ``"(links)"`` removed and surrounding whitespace stripped, and the next
    two fields are returned unchanged as strings.

    Args:
        invoices: Pipe-separated invoice tables, one string per invoice.

    Returns:
        A flat list with one tuple per table row across all invoices.

    Raises:
        IndexError: If a row has fewer than three ``" | "``-separated fields.
    """
    collected = []
    for table in invoices:
        # Drop the header (first line) and the total (last line).
        for row in table.split("\n")[1:-1]:
            fields = row.split(" | ")
            part_name = fields[0].replace("(rechts)", "").replace("(links)", "").strip()
            collected.append((part_name, fields[1], fields[2]))
    return collected
49
+
50
+
51
def estimate_costs(invoices_folder_path: str, path_to_save_json: str) -> pd.DataFrame:
    """Aggregate per-part cost and labour-hour statistics from PDF invoices.

    The invoice tables are extracted from ``invoices_folder_path``, flattened
    into (car_part, cost, hours) rows, converted to floats and grouped by
    part name. For each part the mean/min/max/count of ``cost`` and the
    mean/min/max of ``hours`` are computed. The result is written as JSON
    (keyed by part name) to ``path_to_save_json`` and also returned.

    Args:
        invoices_folder_path: Directory containing the invoice PDF files.
        path_to_save_json: Destination file for the JSON statistics.

    Returns:
        A DataFrame indexed by ``car_part`` with the columns
        ``average_cost``, ``cost_min``, ``cost_max``, ``count``,
        ``average_hours``, ``hours_min`` and ``hours_max``.
    """
    invoice_tables = extract_invoice_tables(invoices_folder_path)
    part_rows = get_car_parts(invoice_tables)

    raw_df = pd.DataFrame(part_rows, columns=["car_part", "cost", "hours"])
    # The values arrive as strings taken from the invoice text.
    for numeric_column in ("cost", "hours"):
        raw_df[numeric_column] = raw_df[numeric_column].astype(float)

    aggregations = {
        "cost": ["mean", "min", "max", "count"],
        "hours": ["mean", "min", "max"],
    }
    stats_df = raw_df.groupby("car_part").agg(aggregations)
    # Flatten the two-level (column, statistic) labels; the order below
    # mirrors the order of the aggregations above.
    stats_df.columns = [
        "average_cost",
        "cost_min",
        "cost_max",
        "count",
        "average_hours",
        "hours_min",
        "hours_max",
    ]

    stats_by_part = stats_df.to_dict(orient="index")

    with open(path_to_save_json, "w", encoding="utf-8") as json_file:
        json.dump(stats_by_part, json_file, ensure_ascii=False, indent=4)

    return stats_df