Spaces:

Innosphere-AI-organization
/

pixtral-demo

Paused

App Files Files Community

alexandraroze committed on Nov 29, 2024

Commit

1961add

1 Parent(s): c4c635a

added cost_estimation

Browse files

Files changed (1) hide show

cost_estimation/cost_estimation.py +72 -0

cost_estimation/cost_estimation.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import json
+import pandas as pd
+import os
+from pypdf import PdfReader
+import re
+from tqdm import tqdm
# Column header emitted as the first line of every reformatted table.
_INVOICE_TABLE_HEADER = (
    "Beschädigtes Teil | Teilkosten (EUR) | Arbeitsstunden | "
    "Arbeitskosten (EUR/Stunde) | Gesamtkosten (EUR)"
)
# Spans from the table heading up to the first "Gesamtsumme: ... EUR" line.
_INVOICE_TABLE_RE = re.compile(r"Beschädigtes Teil.*?Gesamtsumme:.*?EUR", re.DOTALL)
# A line break that does not follow a digit; such breaks are replaced by a
# space so that the text on both sides ends up on one row.
_WRAPPED_LINE_BREAK_RE = re.compile(r"(?<!\d)\n")


def _reformat_invoice_table(page_text: str, source: str) -> str:
    """Turn the raw text of one invoice page into a " | "-separated table."""
    match = _INVOICE_TABLE_RE.search(page_text)
    if match is None:
        raise ValueError(f"No damaged-parts table found in invoice: {source}")

    # The first extracted line is the original header; it is replaced by
    # _INVOICE_TABLE_HEADER, so only the lines after it are kept.
    body_text = "\n".join(match.group().splitlines()[1:])
    *part_rows, total_line = _WRAPPED_LINE_BREAK_RE.sub(" ", body_text).split("\n")

    output_lines = [_INVOICE_TABLE_HEADER]
    for row in part_rows:
        tokens = row.split(" ")
        # The last four space-separated tokens are the value columns;
        # everything before them is the (possibly multi-word) part name.
        output_lines.append(" ".join(tokens[:-4]) + " | " + " | ".join(tokens[-4:]))
    output_lines.append(total_line)
    return "\n".join(output_lines)


def extract_invoice_tables(invoices_path: str) -> list[str]:
    """Extract the damaged-parts table from every PDF invoice in a folder.

    Only the first page of each file whose name ends with ".pdf" is read.
    Files are processed in sorted name order so the result is deterministic.

    Args:
        invoices_path: Directory containing the invoice PDF files.

    Returns:
        One string per invoice: a fixed header line, one " | "-separated line
        per part, and the closing "Gesamtsumme" line, joined by newlines.
        An empty list when the folder contains no ".pdf" files.

    Raises:
        ValueError: If the first page of an invoice contains no table.
    """
    pdf_paths = [
        os.path.join(invoices_path, file_name)
        for file_name in sorted(os.listdir(invoices_path))
        if file_name.endswith(".pdf")
    ]
    if not pdf_paths:
        # Nothing to parse; skip the progress bar altogether.
        return []

    invoices_tables = []
    for pdf_path in tqdm(pdf_paths):
        first_page = PdfReader(pdf_path).pages[0]
        invoices_tables.append(
            _reformat_invoice_table(first_page.extract_text(), pdf_path)
        )
    return invoices_tables
def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
    """Collect (part name, part cost, labour hours) rows from invoice tables.

    Each invoice is a newline-separated table whose first line is a header
    and whose last line is the total; both are ignored. The remaining lines
    are split on " | " and the first three fields are kept as strings.

    Args:
        invoices: Invoice tables with " | "-separated columns.

    Returns:
        One tuple per part row, in input order. The side markers "(rechts)"
        and "(links)" are removed from the part name and it is stripped of
        surrounding whitespace.

    Raises:
        IndexError: If a non-blank row has fewer than three fields.
    """
    car_parts = []
    for invoice in invoices:
        for line in invoice.split("\n")[1:-1]:
            if not line.strip():
                # An invoice without any part rows yields a blank line
                # between header and total; there is nothing to collect.
                continue
            fields = line.split(" | ")
            part_name = fields[0].replace("(rechts)", "").replace("(links)", "").strip()
            car_parts.append((part_name, fields[1], fields[2]))
    return car_parts
def estimate_costs(invoices_folder_path: str, path_to_save_json: str) -> pd.DataFrame:
    """Aggregate per-part cost and labour-hour statistics from PDF invoices.

    The invoices in the folder are parsed, every part row is collected, and
    the rows are grouped by part name. The statistics are written as JSON
    (keyed by part name) and also returned.

    Args:
        invoices_folder_path: Directory containing the invoice PDF files.
        path_to_save_json: Destination file for the JSON statistics.

    Returns:
        A DataFrame indexed by part name with the columns average_cost,
        cost_min, cost_max, count, average_hours, hours_min and hours_max.
    """
    invoice_tables = extract_invoice_tables(invoices_folder_path)
    part_rows = get_car_parts(invoice_tables)

    parts = pd.DataFrame(part_rows, columns=["car_part", "cost", "hours"])
    # The values arrive as strings taken from the invoice text.
    for numeric_column in ("cost", "hours"):
        parts[numeric_column] = parts[numeric_column].astype(float)

    aggregations = {
        "cost": ["mean", "min", "max", "count"],
        "hours": ["mean", "min", "max"],
    }
    summary = parts.groupby("car_part").agg(aggregations)
    # Flatten the two-level (column, statistic) header; the order follows
    # the aggregation spec above.
    summary.columns = [
        "average_cost",
        "cost_min",
        "cost_max",
        "count",
        "average_hours",
        "hours_min",
        "hours_max",
    ]

    stats_by_part = summary.to_dict(orient="index")
    with open(path_to_save_json, "w", encoding="utf-8") as json_file:
        json.dump(stats_by_part, json_file, ensure_ascii=False, indent=4)
    return summary