alexandraroze committed on
Commit
50d43ea
·
1 Parent(s): c03c87c

updated cost estimation

Browse files
Files changed (1) hide show
  1. cost_estimation/cost_estimation.py +57 -5
cost_estimation/cost_estimation.py CHANGED
@@ -4,6 +4,10 @@ import os
4
  from pypdf import PdfReader
5
  import re
6
  from tqdm import tqdm
 
 
 
 
7
 
8
 
9
  def extract_invoice_tables(invoices_path: str) -> list[str]:
@@ -34,12 +38,33 @@ def extract_invoice_tables(invoices_path: str) -> list[str]:
34
  return invoices_tables
35
 
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
38
  car_parts = []
39
  for invoice in invoices:
40
  car_parts += [
41
  (
42
- line.split(" | ")[0].replace("(rechts)", "").replace("(links)", "").strip(),
43
  line.split(" | ")[1],
44
  line.split(" | ")[2]
45
  )
@@ -48,25 +73,52 @@ def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
48
  return car_parts
49
 
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def estimate_costs(invoices_folder_path: str, path_to_save_json: str) -> pd.DataFrame:
52
  invoices = extract_invoice_tables(invoices_folder_path)
53
  car_parts = get_car_parts(invoices)
54
 
55
  car_parts_df = pd.DataFrame(car_parts, columns=["car_part", "cost", "hours"])
 
 
 
 
 
 
56
  car_parts_df["cost"] = car_parts_df["cost"].astype(float)
57
  car_parts_df["hours"] = car_parts_df["hours"].astype(float)
58
- car_parts_df = car_parts_df.groupby("car_part").agg(
59
  {"cost": ["mean", "min", "max", "count"], "hours": ["mean", "min", "max"]}
60
  )
61
- car_parts_df.columns = [
62
  "average_cost", "cost_min", "cost_max", "count", "average_hours", "hours_min", "hours_max"
63
  ]
64
 
65
- car_parts_dict = car_parts_df.to_dict(
66
  orient="index"
67
  )
68
 
69
  with open(path_to_save_json, "w", encoding="utf-8") as f:
70
  json.dump(car_parts_dict, f, ensure_ascii=False, indent=4)
71
 
72
- return car_parts_df
 
4
  from pypdf import PdfReader
5
  import re
6
  from tqdm import tqdm
7
+ from openai import AzureOpenAI, Client
8
+ from prompts import ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT
9
+ import ast
10
+ import time
11
 
12
 
13
  def extract_invoice_tables(invoices_path: str) -> list[str]:
 
38
  return invoices_tables
39
 
40
 
41
def generate_response(client: Client, car_parts: list[str]) -> str:
    """Ask the chat model for cost estimates for one batch of car parts.

    The part names are joined with newlines and handed to
    ``ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT`` (invoked as a callable)
    to build the single user message sent to the ``gpt-4o`` deployment.

    Args:
        client: Client exposing ``chat.completions.create``.
        car_parts: Names of the car parts to include in this request.

    Returns:
        The reply text with Markdown code-fence markers (```` ```python ````
        and ```` ``` ````) removed and surrounding whitespace stripped.

    Raises:
        ValueError: If the response has no choices or the first choice
            carries no text content.
    """
    prompt = ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT("\n".join(car_parts))

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        # NOTE(review): 1024 output tokens may truncate the reply for large
        # batches; a truncated reply will not parse downstream - confirm the
        # budget against the batch size used by the caller.
        max_tokens=1024,
        temperature=0.5,
    )

    # Fail with a clear message instead of an IndexError/AttributeError when
    # the service returns nothing usable.
    if not response.choices:
        raise ValueError("Model response contains no choices")
    message = response.choices[0].message.content
    if message is None:
        raise ValueError("Model response contains no text content")

    # The model tends to wrap its answer in a fenced code block; drop the
    # fence markers so only the literal itself remains.
    return message.replace("```python", "").replace("```", "").strip()
60
+
61
+
62
  def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
63
  car_parts = []
64
  for invoice in invoices:
65
  car_parts += [
66
  (
67
+ line.split(" | ")[0].replace("(rechts)", "").replace("(links)", "").replace("links", "").replace("rechts", "").strip(),
68
  line.split(" | ")[1],
69
  line.split(" | ")[2]
70
  )
 
73
  return car_parts
74
 
75
 
76
def prompt_model_for_cost_estimation(
    parts_to_estimate: list[str],
    batch_size: int = 100
) -> list[tuple[str, float, float]]:
    """Collect model-generated cost estimates for the given car parts.

    The parts are sent to the model in consecutive batches of ``batch_size``.
    Each reply is parsed with ``ast.literal_eval`` and its items are appended
    to the result. A batch that fails (request error, unparsable reply, or a
    reply that is not a list/tuple) is reported on stdout and skipped, so the
    result may cover fewer parts than were requested.

    Args:
        parts_to_estimate: Names of the car parts to estimate.
        batch_size: Number of part names sent per request.

    Returns:
        The concatenated items parsed from all successful batches.
        NOTE(review): the annotation promises ``(part, cost, hours)`` tuples,
        but the item shape is whatever the model returned - it is not
        validated here.

    Raises:
        KeyError: If ``AZURE_API_KEY``, ``AZURE_API_VERSION`` or
            ``AZURE_ENDPOINT`` is missing from the environment (only checked
            when there is at least one part to estimate).
    """
    car_parts = []
    if not parts_to_estimate:
        # Nothing to ask; avoid requiring credentials for a no-op.
        return car_parts

    client = AzureOpenAI(
        api_key=os.environ["AZURE_API_KEY"],
        api_version=os.environ["AZURE_API_VERSION"],
        azure_endpoint=os.environ["AZURE_ENDPOINT"],
    )

    batch_starts = list(range(0, len(parts_to_estimate), batch_size))
    for position, start in enumerate(tqdm(batch_starts)):
        if position:
            # Pause between consecutive requests only - not after the last
            # batch, where the wait would be wasted.
            time.sleep(5)

        batch = parts_to_estimate[start:start + batch_size]
        try:
            response = generate_response(client, batch)
            parsed = ast.literal_eval(response)
            # A dict or string would be silently "extended" into keys or
            # characters; reject anything that is not a sequence of rows.
            if not isinstance(parsed, (list, tuple)):
                raise ValueError(
                    f"expected a list of tuples, got {type(parsed).__name__}"
                )
        except Exception as e:
            # Deliberately best-effort: one bad batch must not discard the
            # estimates already collected from the others.
            print(f"Error: {e}")
            continue

        car_parts.extend(parsed)
    return car_parts
95
+
96
+
97
def estimate_costs(
    invoices_folder_path: str,
    path_to_save_json: str,
    estimation_rounds: int = 3,
) -> pd.DataFrame:
    """Aggregate per-part cost and labour statistics and save them as JSON.

    Rows extracted from the invoices are combined with model-generated
    estimates for every distinct part name, then grouped by part.

    Args:
        invoices_folder_path: Passed to ``extract_invoice_tables`` to load
            the invoices.
        path_to_save_json: File the aggregated statistics are written to
            (UTF-8, non-ASCII characters kept as-is, 4-space indent).
        estimation_rounds: How many times the model is asked to estimate the
            full list of distinct parts. Each round adds its rows to the
            data set. Defaults to 3.

    Returns:
        A DataFrame indexed by ``car_part`` with the columns
        ``average_cost``, ``cost_min``, ``cost_max``, ``count``,
        ``average_hours``, ``hours_min`` and ``hours_max``. Model-generated
        rows are included in every statistic, including ``count``.
    """
    invoices = extract_invoice_tables(invoices_folder_path)
    car_parts = get_car_parts(invoices)

    # Distinct part names in order of first appearance; no need to build a
    # DataFrame just to deduplicate.
    parts_to_estimate = list(dict.fromkeys(row[0] for row in car_parts))

    for _ in range(estimation_rounds):
        car_parts.extend(prompt_model_for_cost_estimation(parts_to_estimate))

    car_parts_df = pd.DataFrame(car_parts, columns=["car_part", "cost", "hours"])

    # Invoice values arrive as strings; normalise both sources to floats
    # before aggregating.
    car_parts_df["cost"] = car_parts_df["cost"].astype(float)
    car_parts_df["hours"] = car_parts_df["hours"].astype(float)

    car_parts_df_grouped = car_parts_df.groupby("car_part").agg(
        {"cost": ["mean", "min", "max", "count"], "hours": ["mean", "min", "max"]}
    )
    # Flatten the (column, statistic) MultiIndex into plain names, in the
    # same order as the aggregation spec above.
    car_parts_df_grouped.columns = [
        "average_cost", "cost_min", "cost_max", "count", "average_hours", "hours_min", "hours_max"
    ]

    car_parts_dict = car_parts_df_grouped.to_dict(orient="index")

    with open(path_to_save_json, "w", encoding="utf-8") as f:
        json.dump(car_parts_dict, f, ensure_ascii=False, indent=4)

    return car_parts_df_grouped