Commit
·
0ca755d
1
Parent(s):
398ae73
added json invoices
Browse files- app.py +2 -0
- src/RAG.py +46 -23
- src/pipelines.py +3 -1
app.py
CHANGED
@@ -32,6 +32,7 @@ st.set_page_config(page_title="Invoice generator", layout="wide")
|
|
32 |
output_folder = "output"
|
33 |
data_folder = "data"
|
34 |
template = "template.md"
|
|
|
35 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
36 |
with open(f"{data_folder}/car_parts.json", "r") as f:
|
37 |
car_parts = json.load(f)
|
@@ -51,6 +52,7 @@ if "pipeline" not in st.session_state:
|
|
51 |
path_to_template=f"{data_folder}/{template}",
|
52 |
reranker_model="monovlm",
|
53 |
device=device,
|
|
|
54 |
gpu_memory_utilization=0.65
|
55 |
)
|
56 |
pipeline = st.session_state.pipeline
|
|
|
32 |
output_folder = "output"
|
33 |
data_folder = "data"
|
34 |
template = "template.md"
|
35 |
+
invoice_json_path = f"{data_folder}/invoices.json"
|
36 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
37 |
with open(f"{data_folder}/car_parts.json", "r") as f:
|
38 |
car_parts = json.load(f)
|
|
|
52 |
path_to_template=f"{data_folder}/{template}",
|
53 |
reranker_model="monovlm",
|
54 |
device=device,
|
55 |
+
invoice_json_path=invoice_json_path,
|
56 |
gpu_memory_utilization=0.65
|
57 |
)
|
58 |
pipeline = st.session_state.pipeline
|
src/RAG.py
CHANGED
@@ -10,6 +10,7 @@ import torch
|
|
10 |
import numpy as np
|
11 |
from tqdm import tqdm
|
12 |
import base64
|
|
|
13 |
|
14 |
|
15 |
class RAG:
|
@@ -21,7 +22,8 @@ class RAG:
|
|
21 |
device="cpu",
|
22 |
image_invoice_index_path=None,
|
23 |
path_to_invoices=None,
|
24 |
-
path_to_images=None
|
|
|
25 |
):
|
26 |
self.index = faiss.read_index(fais_index_path)
|
27 |
self.model, self.preprocess = clip.load(clip_model, device=device)
|
@@ -31,6 +33,9 @@ class RAG:
|
|
31 |
self.path_to_invoices = path_to_invoices
|
32 |
self.path_to_images = path_to_images
|
33 |
self.reranker = reranker
|
|
|
|
|
|
|
34 |
|
35 |
@staticmethod
|
36 |
def image_to_base64(image_path):
|
@@ -56,7 +61,45 @@ class RAG:
|
|
56 |
distances, indices = self.index.search(image_features, k)
|
57 |
return distances, indices
|
58 |
|
59 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
if self.image_invoice_index is None:
|
61 |
raise ValueError("No index for invoices found.")
|
62 |
_, indices = self.search_image(image=image, image_path=image_path, k=k)
|
@@ -67,8 +110,6 @@ class RAG:
|
|
67 |
if self.reranker:
|
68 |
if damage_description is None:
|
69 |
raise ValueError("Damage description must be provided.")
|
70 |
-
# images = [self.image_to_base64(f"{self.path_to_images}/{img_path}") for img_path in images_paths]
|
71 |
-
|
72 |
images = [f"{self.path_to_images}/{img_path}" for img_path in images_paths]
|
73 |
results = self.reranker.rank(damage_description, images, doc_ids=invoices)
|
74 |
invoices = [doc.doc_id for doc in results]
|
@@ -83,25 +124,7 @@ class RAG:
|
|
83 |
invoices_tables = []
|
84 |
|
85 |
for invoice in invoices:
|
86 |
-
|
87 |
-
reader = PdfReader(pdf_path)
|
88 |
-
page = reader.pages[0]
|
89 |
-
text = page.extract_text()
|
90 |
-
|
91 |
-
table_text = re.search(r"Beschädigtes Teil.*?Gesamtsumme:.*?EUR", text, re.DOTALL).group()
|
92 |
-
|
93 |
-
lines = table_text.splitlines()
|
94 |
-
header = lines[0]
|
95 |
-
other_text = "\n".join(lines[1:])
|
96 |
-
cleaned_text = re.sub(r"(?<!\d)\n", " ", other_text)
|
97 |
-
|
98 |
-
table = header + "\n" + cleaned_text
|
99 |
-
|
100 |
-
inv = table.split("\n")
|
101 |
-
reformatted_inv = "Beschädigtes Teil | Teilkosten (EUR) | Arbeitsstunden | Arbeitskosten (EUR/Stunde) | Gesamtkosten (EUR)\n" + "\n".join(
|
102 |
-
" ".join(inv[i].split(" ")[:-4]) + " | " + ' | '.join(inv[i].split(" ")[-4:]) for i in
|
103 |
-
range(1, len(inv) - 1)) + "\n" + inv[-1]
|
104 |
-
|
105 |
invoices_tables.append(reformatted_inv)
|
106 |
|
107 |
return invoices_tables, invoices
|
|
|
10 |
import numpy as np
|
11 |
from tqdm import tqdm
|
12 |
import base64
|
13 |
+
import json
|
14 |
|
15 |
|
16 |
class RAG:
|
|
|
22 |
device="cpu",
|
23 |
image_invoice_index_path=None,
|
24 |
path_to_invoices=None,
|
25 |
+
path_to_images=None,
|
26 |
+
path_to_invoice_json=None
|
27 |
):
|
28 |
self.index = faiss.read_index(fais_index_path)
|
29 |
self.model, self.preprocess = clip.load(clip_model, device=device)
|
|
|
33 |
self.path_to_invoices = path_to_invoices
|
34 |
self.path_to_images = path_to_images
|
35 |
self.reranker = reranker
|
36 |
+
if path_to_invoice_json:
|
37 |
+
with open(path_to_invoice_json, "r") as f:
|
38 |
+
self.invoice_json = json.load(f)
|
39 |
|
40 |
@staticmethod
|
41 |
def image_to_base64(image_path):
|
|
|
61 |
distances, indices = self.index.search(image_features, k)
|
62 |
return distances, indices
|
63 |
|
64 |
+
def return_invoice_table(self, path=None, invoice_is_table=True):
    """Return the cost table of a single invoice as text.

    Two sources are supported:

    * ``invoice_is_table=True``: the table is looked up under ``path`` in
      the mapping stored on ``self.invoice_json``.
    * ``invoice_is_table=False``: the table is extracted from the first
      page of the PDF at ``{self.path_to_invoices}/{path}`` and reformatted
      into pipe-separated columns.

    Args:
        path: Invoice identifier; used as the key into ``self.invoice_json``
            or as the PDF file name relative to ``self.path_to_invoices``.
        invoice_is_table: Selects the JSON lookup (True) or PDF parsing
            (False).

    Returns:
        The entry stored in ``self.invoice_json`` for ``path``, or the
        reformatted table string built from the PDF text.

    Raises:
        ValueError: If ``path`` is missing in PDF mode, if no invoice JSON
            was loaded in table mode, or if the expected table cannot be
            found in the PDF text.
        KeyError: If ``path`` is not a key of ``self.invoice_json``.
    """
    if path is None and not invoice_is_table:
        raise ValueError("Path to invoice must be provided.")

    # The attribute is only assigned when a JSON path was given at
    # construction time, so it may be absent altogether; treat "absent" the
    # same as None instead of failing with AttributeError.
    invoice_json = getattr(self, "invoice_json", None)
    if invoice_json is None and invoice_is_table:
        raise ValueError("Path to invoice json must be provided.")

    if invoice_is_table:
        return invoice_json[path]

    pdf_path = f"{self.path_to_invoices}/{path}"
    reader = PdfReader(pdf_path)
    page = reader.pages[0]
    text = page.extract_text()

    # Cut out everything from the table header up to the grand total line.
    match = re.search(r"Beschädigtes Teil.*?Gesamtsumme:.*?EUR", text, re.DOTALL)
    if match is None:
        raise ValueError(f"No invoice table found in {pdf_path}.")
    table_text = match.group()

    lines = table_text.splitlines()
    header = lines[0]
    other_text = "\n".join(lines[1:])
    # A newline that does not follow a digit is replaced by a space, so only
    # line breaks directly after a digit keep separating rows.
    cleaned_text = re.sub(r"(?<!\d)\n", " ", other_text)

    table = header + "\n" + cleaned_text

    inv = table.split("\n")
    # inv[0] is the extracted header (replaced by the fixed header below) and
    # inv[-1] is the final line, appended unchanged. For every row in between,
    # the last four space-separated tokens become the numeric columns and the
    # remaining leading tokens form the part name.
    rows = []
    for row in inv[1:-1]:
        tokens = row.split(" ")
        rows.append(" ".join(tokens[:-4]) + " | " + " | ".join(tokens[-4:]))

    reformatted_inv = (
        "Beschädigtes Teil | Teilkosten (EUR) | Arbeitsstunden | Arbeitskosten (EUR/Stunde) | Gesamtkosten (EUR)\n"
        + "\n".join(rows)
        + "\n"
        + inv[-1]
    )

    return reformatted_inv
|
93 |
+
|
94 |
+
def find_invoice(
|
95 |
+
self,
|
96 |
+
image=None,
|
97 |
+
image_path=None,
|
98 |
+
return_only_path=True,
|
99 |
+
k=1,
|
100 |
+
damage_description=None,
|
101 |
+
invoice_is_table=True
|
102 |
+
):
|
103 |
if self.image_invoice_index is None:
|
104 |
raise ValueError("No index for invoices found.")
|
105 |
_, indices = self.search_image(image=image, image_path=image_path, k=k)
|
|
|
110 |
if self.reranker:
|
111 |
if damage_description is None:
|
112 |
raise ValueError("Damage description must be provided.")
|
|
|
|
|
113 |
images = [f"{self.path_to_images}/{img_path}" for img_path in images_paths]
|
114 |
results = self.reranker.rank(damage_description, images, doc_ids=invoices)
|
115 |
invoices = [doc.doc_id for doc in results]
|
|
|
124 |
invoices_tables = []
|
125 |
|
126 |
for invoice in invoices:
|
127 |
+
reformatted_inv = self.return_invoice_table(invoice, invoice_is_table)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
invoices_tables.append(reformatted_inv)
|
129 |
|
130 |
return invoices_tables, invoices
|
src/pipelines.py
CHANGED
@@ -18,6 +18,7 @@ class InvoiceGenerator:
|
|
18 |
path_to_template,
|
19 |
reranker_model=None,
|
20 |
device="cuda",
|
|
|
21 |
max_model_len=4096, max_tokens=2048, gpu_memory_utilization=0.95
|
22 |
):
|
23 |
self.model = Pixtral(max_model_len=max_model_len, max_tokens=max_tokens,
|
@@ -31,7 +32,8 @@ class InvoiceGenerator:
|
|
31 |
image_invoice_index_path=image_invoice_index_path,
|
32 |
path_to_invoices=path_to_invoices,
|
33 |
path_to_images=path_to_images,
|
34 |
-
reranker=self.reranker
|
|
|
35 |
)
|
36 |
self.path_to_invoices = path_to_invoices
|
37 |
self.path_to_images = path_to_images
|
|
|
18 |
path_to_template,
|
19 |
reranker_model=None,
|
20 |
device="cuda",
|
21 |
+
invoice_json_path=None,
|
22 |
max_model_len=4096, max_tokens=2048, gpu_memory_utilization=0.95
|
23 |
):
|
24 |
self.model = Pixtral(max_model_len=max_model_len, max_tokens=max_tokens,
|
|
|
32 |
image_invoice_index_path=image_invoice_index_path,
|
33 |
path_to_invoices=path_to_invoices,
|
34 |
path_to_images=path_to_images,
|
35 |
+
reranker=self.reranker,
|
36 |
+
path_to_invoice_json=invoice_json_path
|
37 |
)
|
38 |
self.path_to_invoices = path_to_invoices
|
39 |
self.path_to_images = path_to_images
|