alexandraroze committed on
Commit
0ca755d
·
1 Parent(s): 398ae73

added json invoices

Browse files
Files changed (3) hide show
  1. app.py +2 -0
  2. src/RAG.py +46 -23
  3. src/pipelines.py +3 -1
app.py CHANGED
@@ -32,6 +32,7 @@ st.set_page_config(page_title="Invoice generator", layout="wide")
32
  output_folder = "output"
33
  data_folder = "data"
34
  template = "template.md"
 
35
  device = "cuda" if torch.cuda.is_available() else "cpu"
36
  with open(f"{data_folder}/car_parts.json", "r") as f:
37
  car_parts = json.load(f)
@@ -51,6 +52,7 @@ if "pipeline" not in st.session_state:
51
  path_to_template=f"{data_folder}/{template}",
52
  reranker_model="monovlm",
53
  device=device,
 
54
  gpu_memory_utilization=0.65
55
  )
56
  pipeline = st.session_state.pipeline
 
32
  output_folder = "output"
33
  data_folder = "data"
34
  template = "template.md"
35
+ invoice_json_path = f"{data_folder}/invoices.json"
36
  device = "cuda" if torch.cuda.is_available() else "cpu"
37
  with open(f"{data_folder}/car_parts.json", "r") as f:
38
  car_parts = json.load(f)
 
52
  path_to_template=f"{data_folder}/{template}",
53
  reranker_model="monovlm",
54
  device=device,
55
+ invoice_json_path=invoice_json_path,
56
  gpu_memory_utilization=0.65
57
  )
58
  pipeline = st.session_state.pipeline
src/RAG.py CHANGED
@@ -10,6 +10,7 @@ import torch
10
  import numpy as np
11
  from tqdm import tqdm
12
  import base64
 
13
 
14
 
15
  class RAG:
@@ -21,7 +22,8 @@ class RAG:
21
  device="cpu",
22
  image_invoice_index_path=None,
23
  path_to_invoices=None,
24
- path_to_images=None
 
25
  ):
26
  self.index = faiss.read_index(fais_index_path)
27
  self.model, self.preprocess = clip.load(clip_model, device=device)
@@ -31,6 +33,9 @@ class RAG:
31
  self.path_to_invoices = path_to_invoices
32
  self.path_to_images = path_to_images
33
  self.reranker = reranker
 
 
 
34
 
35
  @staticmethod
36
  def image_to_base64(image_path):
@@ -56,7 +61,45 @@ class RAG:
56
  distances, indices = self.index.search(image_features, k)
57
  return distances, indices
58
 
59
- def find_invoice(self, image=None, image_path=None, return_only_path=True, k=1, damage_description=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  if self.image_invoice_index is None:
61
  raise ValueError("No index for invoices found.")
62
  _, indices = self.search_image(image=image, image_path=image_path, k=k)
@@ -67,8 +110,6 @@ class RAG:
67
  if self.reranker:
68
  if damage_description is None:
69
  raise ValueError("Damage description must be provided.")
70
- # images = [self.image_to_base64(f"{self.path_to_images}/{img_path}") for img_path in images_paths]
71
-
72
  images = [f"{self.path_to_images}/{img_path}" for img_path in images_paths]
73
  results = self.reranker.rank(damage_description, images, doc_ids=invoices)
74
  invoices = [doc.doc_id for doc in results]
@@ -83,25 +124,7 @@ class RAG:
83
  invoices_tables = []
84
 
85
  for invoice in invoices:
86
- pdf_path = f"{self.path_to_invoices}/{invoice}"
87
- reader = PdfReader(pdf_path)
88
- page = reader.pages[0]
89
- text = page.extract_text()
90
-
91
- table_text = re.search(r"Beschädigtes Teil.*?Gesamtsumme:.*?EUR", text, re.DOTALL).group()
92
-
93
- lines = table_text.splitlines()
94
- header = lines[0]
95
- other_text = "\n".join(lines[1:])
96
- cleaned_text = re.sub(r"(?<!\d)\n", " ", other_text)
97
-
98
- table = header + "\n" + cleaned_text
99
-
100
- inv = table.split("\n")
101
- reformatted_inv = "Beschädigtes Teil | Teilkosten (EUR) | Arbeitsstunden | Arbeitskosten (EUR/Stunde) | Gesamtkosten (EUR)\n" + "\n".join(
102
- " ".join(inv[i].split(" ")[:-4]) + " | " + ' | '.join(inv[i].split(" ")[-4:]) for i in
103
- range(1, len(inv) - 1)) + "\n" + inv[-1]
104
-
105
  invoices_tables.append(reformatted_inv)
106
 
107
  return invoices_tables, invoices
 
10
  import numpy as np
11
  from tqdm import tqdm
12
  import base64
13
+ import json
14
 
15
 
16
  class RAG:
 
22
  device="cpu",
23
  image_invoice_index_path=None,
24
  path_to_invoices=None,
25
+ path_to_images=None,
26
+ path_to_invoice_json=None
27
  ):
28
  self.index = faiss.read_index(fais_index_path)
29
  self.model, self.preprocess = clip.load(clip_model, device=device)
 
33
  self.path_to_invoices = path_to_invoices
34
  self.path_to_images = path_to_images
35
  self.reranker = reranker
36
+ if path_to_invoice_json:
37
+ with open(path_to_invoice_json, "r") as f:
38
+ self.invoice_json = json.load(f)
39
 
40
  @staticmethod
41
  def image_to_base64(image_path):
 
61
  distances, indices = self.index.search(image_features, k)
62
  return distances, indices
63
 
64
+ def return_invoice_table(self, path=None, invoice_is_table=True):
65
+ if path is None and not invoice_is_table:
66
+ raise ValueError("Path to invoice must be provided.")
67
+ if self.invoice_json is None and invoice_is_table:
68
+ raise ValueError("Path to invoice json must be provided.")
69
+
70
+ if invoice_is_table:
71
+ return self.invoice_json[path]
72
+
73
+ pdf_path = f"{self.path_to_invoices}/{path}"
74
+ reader = PdfReader(pdf_path)
75
+ page = reader.pages[0]
76
+ text = page.extract_text()
77
+
78
+ table_text = re.search(r"Beschädigtes Teil.*?Gesamtsumme:.*?EUR", text, re.DOTALL).group()
79
+
80
+ lines = table_text.splitlines()
81
+ header = lines[0]
82
+ other_text = "\n".join(lines[1:])
83
+ cleaned_text = re.sub(r"(?<!\d)\n", " ", other_text)
84
+
85
+ table = header + "\n" + cleaned_text
86
+
87
+ inv = table.split("\n")
88
+ reformatted_inv = "Beschädigtes Teil | Teilkosten (EUR) | Arbeitsstunden | Arbeitskosten (EUR/Stunde) | Gesamtkosten (EUR)\n" + "\n".join(
89
+ " ".join(inv[i].split(" ")[:-4]) + " | " + ' | '.join(inv[i].split(" ")[-4:]) for i in
90
+ range(1, len(inv) - 1)) + "\n" + inv[-1]
91
+
92
+ return reformatted_inv
93
+
94
+ def find_invoice(
95
+ self,
96
+ image=None,
97
+ image_path=None,
98
+ return_only_path=True,
99
+ k=1,
100
+ damage_description=None,
101
+ invoice_is_table=True
102
+ ):
103
  if self.image_invoice_index is None:
104
  raise ValueError("No index for invoices found.")
105
  _, indices = self.search_image(image=image, image_path=image_path, k=k)
 
110
  if self.reranker:
111
  if damage_description is None:
112
  raise ValueError("Damage description must be provided.")
 
 
113
  images = [f"{self.path_to_images}/{img_path}" for img_path in images_paths]
114
  results = self.reranker.rank(damage_description, images, doc_ids=invoices)
115
  invoices = [doc.doc_id for doc in results]
 
124
  invoices_tables = []
125
 
126
  for invoice in invoices:
127
+ reformatted_inv = self.return_invoice_table(invoice, invoice_is_table)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  invoices_tables.append(reformatted_inv)
129
 
130
  return invoices_tables, invoices
src/pipelines.py CHANGED
@@ -18,6 +18,7 @@ class InvoiceGenerator:
18
  path_to_template,
19
  reranker_model=None,
20
  device="cuda",
 
21
  max_model_len=4096, max_tokens=2048, gpu_memory_utilization=0.95
22
  ):
23
  self.model = Pixtral(max_model_len=max_model_len, max_tokens=max_tokens,
@@ -31,7 +32,8 @@ class InvoiceGenerator:
31
  image_invoice_index_path=image_invoice_index_path,
32
  path_to_invoices=path_to_invoices,
33
  path_to_images=path_to_images,
34
- reranker=self.reranker
 
35
  )
36
  self.path_to_invoices = path_to_invoices
37
  self.path_to_images = path_to_images
 
18
  path_to_template,
19
  reranker_model=None,
20
  device="cuda",
21
+ invoice_json_path=None,
22
  max_model_len=4096, max_tokens=2048, gpu_memory_utilization=0.95
23
  ):
24
  self.model = Pixtral(max_model_len=max_model_len, max_tokens=max_tokens,
 
32
  image_invoice_index_path=image_invoice_index_path,
33
  path_to_invoices=path_to_invoices,
34
  path_to_images=path_to_images,
35
+ reranker=self.reranker,
36
+ path_to_invoice_json=invoice_json_path
37
  )
38
  self.path_to_invoices = path_to_invoices
39
  self.path_to_images = path_to_images