Commit
·
50d43ea
1
Parent(s):
c03c87c
updated cost estimation
Browse files
cost_estimation/cost_estimation.py
CHANGED
@@ -4,6 +4,10 @@ import os
|
|
4 |
from pypdf import PdfReader
|
5 |
import re
|
6 |
from tqdm import tqdm
|
|
|
|
|
|
|
|
|
7 |
|
8 |
|
9 |
def extract_invoice_tables(invoices_path: str) -> list[str]:
|
@@ -34,12 +38,33 @@ def extract_invoice_tables(invoices_path: str) -> list[str]:
|
|
34 |
return invoices_tables
|
35 |
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
|
38 |
car_parts = []
|
39 |
for invoice in invoices:
|
40 |
car_parts += [
|
41 |
(
|
42 |
-
line.split(" | ")[0].replace("(rechts)", "").replace("(links)", "").strip(),
|
43 |
line.split(" | ")[1],
|
44 |
line.split(" | ")[2]
|
45 |
)
|
@@ -48,25 +73,52 @@ def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
|
|
48 |
return car_parts
|
49 |
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
def estimate_costs(invoices_folder_path: str, path_to_save_json: str) -> pd.DataFrame:
|
52 |
invoices = extract_invoice_tables(invoices_folder_path)
|
53 |
car_parts = get_car_parts(invoices)
|
54 |
|
55 |
car_parts_df = pd.DataFrame(car_parts, columns=["car_part", "cost", "hours"])
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
car_parts_df["cost"] = car_parts_df["cost"].astype(float)
|
57 |
car_parts_df["hours"] = car_parts_df["hours"].astype(float)
|
58 |
-
|
59 |
{"cost": ["mean", "min", "max", "count"], "hours": ["mean", "min", "max"]}
|
60 |
)
|
61 |
-
|
62 |
"average_cost", "cost_min", "cost_max", "count", "average_hours", "hours_min", "hours_max"
|
63 |
]
|
64 |
|
65 |
-
car_parts_dict =
|
66 |
orient="index"
|
67 |
)
|
68 |
|
69 |
with open(path_to_save_json, "w", encoding="utf-8") as f:
|
70 |
json.dump(car_parts_dict, f, ensure_ascii=False, indent=4)
|
71 |
|
72 |
-
return
|
|
|
4 |
from pypdf import PdfReader
|
5 |
import re
|
6 |
from tqdm import tqdm
|
7 |
+
from openai import AzureOpenAI, Client
|
8 |
+
from prompts import ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT
|
9 |
+
import ast
|
10 |
+
import time
|
11 |
|
12 |
|
13 |
def extract_invoice_tables(invoices_path: str) -> list[str]:
|
|
|
38 |
return invoices_tables
|
39 |
|
40 |
|
41 |
+
def generate_response(
    client: Client,
    car_parts: list[str],
    *,
    model: str = "gpt-4o",
    max_tokens: int = 1024,
    temperature: float = 0.5,
) -> str:
    """Ask the chat model to estimate replacement costs for ``car_parts``.

    The part names are joined with newlines, passed through
    ``ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT`` and sent as a single
    user message. Markdown code fences are stripped from the reply so the
    caller can parse the remaining text.

    Args:
        client: Client object exposing ``chat.completions.create``.
        car_parts: Names of the car parts to estimate.
        model: Value forwarded as ``model`` to the completion request.
        max_tokens: Upper bound on the reply length forwarded to the request.
            NOTE(review): 1024 tokens may truncate the reply for large
            batches -- confirm against the batch size used by the caller.
        temperature: Sampling temperature forwarded to the request.

    Returns:
        The reply text with ```` ```python ```` and ```` ``` ```` fences removed.

    Raises:
        ValueError: If the first choice of the completion has no text content.
    """
    prompt = ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT("\n".join(car_parts))

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        max_tokens=max_tokens,
        temperature=temperature,
    )

    message = response.choices[0].message.content
    if message is None:
        # The content field may be empty; fail with a clear message instead
        # of an AttributeError raised by ``None.replace``.
        raise ValueError("Model returned no text content for the cost estimation prompt")

    return message.replace("```python", "").replace("```", "")
|
60 |
+
|
61 |
+
|
62 |
def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
|
63 |
car_parts = []
|
64 |
for invoice in invoices:
|
65 |
car_parts += [
|
66 |
(
|
67 |
+
line.split(" | ")[0].replace("(rechts)", "").replace("(links)", "").replace("links", "").replace("rechts", "").strip(),
|
68 |
line.split(" | ")[1],
|
69 |
line.split(" | ")[2]
|
70 |
)
|
|
|
73 |
return car_parts
|
74 |
|
75 |
|
76 |
+
def prompt_model_for_cost_estimation(
    parts_to_estimate: list[str],
    batch_size: int = 100
) -> list[tuple[str, float, float]]:
    """Collect model-generated cost estimates for the given car parts.

    An ``AzureOpenAI`` client is created from the ``AZURE_API_KEY``,
    ``AZURE_API_VERSION`` and ``AZURE_ENDPOINT`` environment variables. The
    parts are sent to ``generate_response`` in slices of ``batch_size`` and
    each reply is parsed as a Python literal. A batch that fails (request
    error or unparsable reply) is reported on stdout and skipped, so the
    result may cover fewer parts than were requested.

    Args:
        parts_to_estimate: Car part names to send to the model.
        batch_size: Number of part names sent per request.

    Returns:
        A list of ``(car_part, cost, hours)`` tuples gathered from all
        successfully parsed batches.

    Raises:
        KeyError: If one of the required environment variables is missing.
    """
    car_parts = []
    client = AzureOpenAI(
        api_key=os.environ["AZURE_API_KEY"],
        api_version=os.environ["AZURE_API_VERSION"],
        azure_endpoint=os.environ["AZURE_ENDPOINT"],
    )
    for i in tqdm(range(0, len(parts_to_estimate), batch_size)):
        batch = parts_to_estimate[i:i + batch_size]
        try:
            response = generate_response(client, batch)
            car_parts.extend(_parse_estimates(response))
        except Exception as e:
            # Deliberate best-effort: one failed batch must not abort the run.
            print(f"Error: {e}")
        # Pause after every batch, including failed ones, so that an error
        # is not followed by an immediate new request.
        time.sleep(5)
    return car_parts


def _parse_estimates(response: str) -> list[tuple[str, float, float]]:
    """Parse one model reply into validated ``(car_part, cost, hours)`` tuples."""
    parsed = ast.literal_eval(response)
    if not isinstance(parsed, (list, tuple)):
        # Extending with a dict or a string would silently add keys/characters.
        raise ValueError(
            f"expected a list of estimates, got {type(parsed).__name__}"
        )

    estimates = []
    for entry in parsed:
        try:
            name, cost, hours = entry
            if not isinstance(name, str):
                raise TypeError("car part name must be a string")
            estimates.append((name, float(cost), float(hours)))
        except (TypeError, ValueError):
            # Drop only the malformed entry; keep the rest of the batch.
            print(f"Error: skipping malformed estimate {entry!r}")
    return estimates
|
95 |
+
|
96 |
+
|
97 |
def estimate_costs(invoices_folder_path: str, path_to_save_json: str) -> pd.DataFrame:
    """Aggregate per-part cost and labour statistics and save them as JSON.

    Car part rows are obtained from ``extract_invoice_tables`` and
    ``get_car_parts``. The distinct part names are then passed three times
    to ``prompt_model_for_cost_estimation`` and the returned rows are
    appended to the invoice rows. The combined rows are grouped by part name
    and summarised; the summary is written to ``path_to_save_json`` and
    returned.

    Args:
        invoices_folder_path: Path handed to ``extract_invoice_tables``.
        path_to_save_json: Destination file for the JSON summary.

    Returns:
        A DataFrame indexed by ``car_part`` with the columns
        ``average_cost``, ``cost_min``, ``cost_max``, ``count``,
        ``average_hours``, ``hours_min`` and ``hours_max``.
    """
    row_columns = ["car_part", "cost", "hours"]
    summary_columns = [
        "average_cost", "cost_min", "cost_max", "count", "average_hours", "hours_min", "hours_max"
    ]

    rows = get_car_parts(extract_invoice_tables(invoices_folder_path))

    # Distinct part names, taken before any model-generated rows are added.
    distinct_parts = pd.DataFrame(rows, columns=row_columns)["car_part"].unique().tolist()
    for _ in range(3):
        rows.extend(prompt_model_for_cost_estimation(distinct_parts))

    frame = pd.DataFrame(rows, columns=row_columns)
    for numeric_column in ("cost", "hours"):
        frame[numeric_column] = frame[numeric_column].astype(float)

    aggregations = {"cost": ["mean", "min", "max", "count"], "hours": ["mean", "min", "max"]}
    summary = frame.groupby("car_part").agg(aggregations)
    # Flatten the two-level (column, statistic) header into plain names.
    summary.columns = summary_columns

    with open(path_to_save_json, "w", encoding="utf-8") as f:
        json.dump(summary.to_dict(orient="index"), f, ensure_ascii=False, indent=4)

    return summary
|