Spaces:

Innosphere-AI-organization
/

pixtral-demo

Paused

App Files Files Community

pixtral-demo / cost_estimation /cost_estimation.py

alexandraroze

updated cost estimation

50d43ea 7 months ago

raw

history blame contribute delete

4.28 kB

	import json
	import pandas as pd
	import os
	from pypdf import PdfReader
	import re
	from tqdm import tqdm
	from openai import AzureOpenAI, Client
	from prompts import ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT
	import ast
	import time


	def _reformat_invoice_table(table_text: str) -> str:
	    """Rewrite the raw text of an invoice cost table as delimiter-separated rows.

	    The first line of ``table_text`` (the header printed in the PDF) is dropped
	    and replaced by a fixed header. Every following line except the last is
	    split on single spaces: the last four tokens become the four value columns
	    and everything before them becomes the part name. The last line is appended
	    unchanged.
	    """
	    # NOTE: the separator contains a literal backslash. get_car_parts splits on
	    # exactly this string, so it must stay byte-identical.
	    separator = " \\| "
	    header = separator.join(
	        [
	            "Beschädigtes Teil",
	            "Teilkosten (EUR)",
	            "Arbeitsstunden",
	            "Arbeitskosten (EUR/Stunde)",
	            "Gesamtkosten (EUR)",
	        ]
	    )

	    body_lines = table_text.splitlines()[1:]
	    # Newlines that do not follow a digit are turned into spaces; presumably
	    # this re-joins part names that were wrapped over several lines.
	    merged = re.sub(r"(?<!\d)\n", " ", "\n".join(body_lines))
	    *part_rows, last_line = merged.split("\n")

	    formatted_rows = []
	    for row in part_rows:
	        tokens = row.split(" ")
	        part_name = " ".join(tokens[:-4])
	        formatted_rows.append(separator.join([part_name, *tokens[-4:]]))

	    return header + "\n" + "\n".join(formatted_rows) + "\n" + last_line


	def extract_invoice_tables(invoices_path: str) -> list[str]:
	    """Extract the cost table from the first page of every PDF invoice in a folder.

	    Args:
	        invoices_path: Directory that is scanned (non-recursively) for files
	            whose name ends with ``.pdf``.

	    Returns:
	        One reformatted table per invoice, ordered by file path. Each table is
	        a newline-separated string: a fixed header line, one line per part,
	        and the final line of the matched table text.

	    Raises:
	        ValueError: If the first page of an invoice contains no text span
	            running from "Beschädigtes Teil" to "Gesamtsumme: ... EUR".
	    """
	    # os.listdir returns entries in arbitrary order; sort for reproducible output.
	    pdf_paths = sorted(
	        os.path.join(invoices_path, file_name)
	        for file_name in os.listdir(invoices_path)
	        if file_name.endswith(".pdf")
	    )

	    invoices_tables = []
	    for pdf_path in tqdm(pdf_paths):
	        page_text = PdfReader(pdf_path).pages[0].extract_text()

	        # Non-greedy ".*?" together with DOTALL spans the table's line breaks.
	        table_match = re.search(
	            r"Beschädigtes Teil.*?Gesamtsumme:.*?EUR", page_text, re.DOTALL
	        )
	        if table_match is None:
	            raise ValueError(f"No cost table found on the first page of {pdf_path}")

	        invoices_tables.append(_reformat_invoice_table(table_match.group()))

	    return invoices_tables


	def generate_response(client: Client, car_parts: list[str]):
	    """Send one cost-estimation prompt to the chat model and return its reply.

	    The part names are joined with newlines and passed to
	    ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT to build the prompt text,
	    which is sent as a single user message.

	    Args:
	        client: Client whose ``chat.completions.create`` method is called.
	        car_parts: Names of the parts to include in the prompt.

	    Returns:
	        The content of the first choice with the Markdown code-fence markers
	        removed.
	    """
	    prompt_text = ESTIMATE_COST_OF_CAR_PARTS_REPLACEMENT_PROMPT("\n".join(car_parts))
	    user_message = {
	        "role": "user",
	        "content": [{"type": "text", "text": prompt_text}],
	    }

	    completion = client.chat.completions.create(
	        model="gpt-4o",
	        messages=[user_message],
	        max_tokens=1024,
	        temperature=0.5,
	    )

	    reply = completion.choices[0].message.content
	    # Strip the opening fence first so that no stray "python" tag is left behind.
	    for fence in ("```python", "```"):
	        reply = reply.replace(fence, "")
	    return reply


	def get_car_parts(invoices: list[str]) -> list[tuple[str, str, str]]:
	    """Collect (part name, cost, hours) triples from reformatted invoice tables.

	    For every invoice the first and the last line are skipped. Each remaining
	    line is split on the column separator; the first three fields are kept.
	    Side markers ("(rechts)", "(links)", "rechts", "links") are removed from
	    the part name and the name is stripped of surrounding whitespace.

	    Args:
	        invoices: Newline-separated tables, one string per invoice.

	    Returns:
	        A flat list of ``(car_part, cost, hours)`` tuples; cost and hours are
	        returned as the strings found in the table.

	    Raises:
	        IndexError: If a data line has fewer than three separated fields.
	    """
	    # The separator includes a literal backslash; it is spelled with an escaped
	    # backslash because "\|" is an invalid escape sequence in a normal string.
	    separator = " \\| "
	    # Parenthesised forms come first so no empty "()" is left behind.
	    side_markers = ("(rechts)", "(links)", "links", "rechts")

	    car_parts = []
	    for invoice in invoices:
	        for line in invoice.split("\n")[1:-1]:
	            fields = line.split(separator)
	            part_name = fields[0]
	            for marker in side_markers:
	                part_name = part_name.replace(marker, "")
	            car_parts.append((part_name.strip(), fields[1], fields[2]))
	    return car_parts


	def prompt_model_for_cost_estimation(
	    parts_to_estimate: list[str],
	    batch_size: int = 100
	) -> list[tuple[str, float, float]]:
	    """Ask the model for cost estimates of the given parts, batch by batch.

	    The Azure OpenAI client is configured from the AZURE_API_KEY,
	    AZURE_API_VERSION and AZURE_ENDPOINT environment variables. Each batch is
	    sent through generate_response and the reply is parsed with
	    ``ast.literal_eval``. A batch that fails (request error, unparsable reply,
	    reply of the wrong shape) is reported on stdout and skipped, so the result
	    may cover fewer parts than were requested.

	    Args:
	        parts_to_estimate: Part names to send to the model.
	        batch_size: Maximum number of part names per request; must be >= 1.

	    Returns:
	        The concatenated 3-item rows parsed from all successful batches.

	    Raises:
	        ValueError: If ``batch_size`` is smaller than 1.
	        KeyError: If one of the required environment variables is not set.
	    """
	    if batch_size < 1:
	        raise ValueError("batch_size must be a positive integer")

	    client = AzureOpenAI(
	        api_key=os.environ["AZURE_API_KEY"],
	        api_version=os.environ["AZURE_API_VERSION"],
	        azure_endpoint=os.environ["AZURE_ENDPOINT"],
	    )

	    car_parts = []
	    for start in tqdm(range(0, len(parts_to_estimate), batch_size)):
	        batch = parts_to_estimate[start:start + batch_size]
	        try:
	            parsed = ast.literal_eval(generate_response(client, batch))
	            # Guard against replies that parse but are not a list of rows:
	            # extending with a str or dict would inject characters or keys.
	            if not isinstance(parsed, (list, tuple)) or not all(
	                isinstance(row, (list, tuple)) and len(row) == 3 for row in parsed
	            ):
	                raise ValueError("model reply is not a sequence of 3-item rows")
	            car_parts.extend(parsed)
	        except Exception as e:
	            # Deliberately best-effort: report the failed batch and carry on.
	            print(f"Error: {e}")
	        # Pause after every request, including failed ones, so that an error
	        # is not followed immediately by the next request.
	        time.sleep(5)
	    return car_parts


	def estimate_costs(
	    invoices_folder_path: str,
	    path_to_save_json: str,
	    estimation_rounds: int = 3,
	) -> pd.DataFrame:
	    """Build per-part cost and labour statistics from invoices and model estimates.

	    Part rows are read from the PDF invoices, then the model is asked
	    ``estimation_rounds`` times to estimate every distinct part name. Invoice
	    rows and model estimates are pooled, grouped by part name and aggregated.
	    The aggregate is written to ``path_to_save_json`` as JSON keyed by part
	    name and also returned.

	    Args:
	        invoices_folder_path: Folder containing the PDF invoices.
	        path_to_save_json: Destination file for the JSON output.
	        estimation_rounds: How many times the model is queried for the full
	            list of distinct parts; each round adds its rows to the pool.

	    Returns:
	        A DataFrame indexed by ``car_part`` with the columns ``average_cost``,
	        ``cost_min``, ``cost_max``, ``count``, ``average_hours``,
	        ``hours_min`` and ``hours_max``. ``count`` includes both invoice rows
	        and model estimates.
	    """
	    invoices = extract_invoice_tables(invoices_folder_path)
	    car_parts = get_car_parts(invoices)

	    # dict.fromkeys removes duplicates while keeping first-seen order.
	    parts_to_estimate = list(dict.fromkeys(part[0] for part in car_parts))
	    for _ in range(estimation_rounds):
	        car_parts += prompt_model_for_cost_estimation(parts_to_estimate)

	    car_parts_df = pd.DataFrame(car_parts, columns=["car_part", "cost", "hours"])
	    # Invoice values arrive as strings, so both numeric columns are cast.
	    car_parts_df["cost"] = car_parts_df["cost"].astype(float)
	    car_parts_df["hours"] = car_parts_df["hours"].astype(float)

	    # Named aggregation ties each output column to its source column and
	    # function instead of relying on positional renaming afterwards.
	    car_parts_df_grouped = car_parts_df.groupby("car_part").agg(
	        average_cost=("cost", "mean"),
	        cost_min=("cost", "min"),
	        cost_max=("cost", "max"),
	        count=("cost", "count"),
	        average_hours=("hours", "mean"),
	        hours_min=("hours", "min"),
	        hours_max=("hours", "max"),
	    )

	    car_parts_dict = car_parts_df_grouped.to_dict(orient="index")
	    with open(path_to_save_json, "w", encoding="utf-8") as f:
	        json.dump(car_parts_dict, f, ensure_ascii=False, indent=4)

	    return car_parts_df_grouped