|
import clip |
|
import faiss |
|
from PIL import Image |
|
from pypdf import PdfReader |
|
import pandas as pd |
|
import re |
|
import os |
|
import fitz |
|
import torch |
|
import numpy as np |
|
from tqdm import tqdm |
|
import base64 |
|
import json |
|
|
|
|
|
class RAG:
    """Retrieve invoices for a damage image via CLIP embeddings and a FAISS index.

    The FAISS index is searched with CLIP text or image embeddings. The
    positions it returns are mapped to invoices through an optional CSV table
    that must provide the columns ``img_id``, ``invoice`` and ``image``.
    Invoice contents are served either from pre-extracted JSON mappings
    (keyed by invoice name) or by parsing the first page of the invoice PDF.
    """

    # Column header emitted for invoice tables parsed from PDF text.
    _TABLE_HEADER = (
        "Beschädigtes Teil | Teilkosten (EUR) | Arbeitsstunden | "
        "Arbeitskosten (EUR/Stunde) | Gesamtkosten (EUR)\n"
    )

    def __init__(
        self,
        fais_index_path,
        clip_model="ViT-B/32",
        reranker=None,
        device="cpu",
        image_invoice_index_path=None,
        path_to_invoices=None,
        path_to_images=None,
        path_to_invoice_json=None
    ):
        """Load the FAISS index, the CLIP model and the optional lookup data.

        Args:
            fais_index_path: Path of the FAISS index file to read.
            clip_model: CLIP model name passed to ``clip.load``.
            reranker: Optional object exposing ``rank(query, docs, doc_ids=...)``;
                when truthy, ``find_invoice`` uses it to reorder invoices.
            device: Torch device used for the CLIP model and its inputs.
            image_invoice_index_path: Optional CSV mapping index rows to
                invoices and images.
            path_to_invoices: Directory containing the invoice PDFs.
            path_to_images: Directory containing the indexed images.
            path_to_invoice_json: Either a path to one JSON file, or a dict
                with exactly the keys ``"invoices"`` and
                ``"invoices_granular"``, each holding a JSON file path.

        Raises:
            ValueError: If ``path_to_invoice_json`` has any other format.
        """
        self.index = faiss.read_index(fais_index_path)
        self.model, self.preprocess = clip.load(clip_model, device=device)
        self.device = device

        # Always define the attribute so find_invoice can test it against None
        # instead of failing with AttributeError when no CSV was supplied.
        self.image_invoice_index = None
        if image_invoice_index_path:
            self.image_invoice_index = pd.read_csv(image_invoice_index_path)

        self.path_to_invoices = path_to_invoices
        self.path_to_images = path_to_images
        self.reranker = reranker
        self.invoice_json = None
        self.invoice_json_granular = None

        if path_to_invoice_json:
            if isinstance(path_to_invoice_json, (str, os.PathLike)):
                self.invoice_json = self._load_json(path_to_invoice_json)
            elif (
                isinstance(path_to_invoice_json, dict)
                and set(path_to_invoice_json) == {"invoices", "invoices_granular"}
            ):
                self.invoice_json = self._load_json(
                    path_to_invoice_json["invoices"]
                )
                self.invoice_json_granular = self._load_json(
                    path_to_invoice_json["invoices_granular"]
                )
            else:
                raise ValueError("Invalid format for invoice json.")

    @staticmethod
    def _load_json(json_path):
        """Read and decode one JSON file, explicitly as UTF-8."""
        with open(json_path, "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def _to_query_vectors(features):
        """L2-normalise CLIP features and convert them to a float32 array.

        The tensor is moved to the CPU first so the conversion also works when
        the model runs on an accelerator; the float32 cast matches the dtype
        the index was built with in ``build_rag``.
        """
        features = features / features.norm(dim=-1, keepdim=True)
        return features.detach().cpu().numpy().astype("float32")

    @staticmethod
    def image_to_base64(image_path):
        """Return the base64 encoding (as ``bytes``) of the file's raw content."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read())

    def search_text(self, text, k=1):
        """Search the index with the CLIP embedding of ``text``.

        Returns:
            The ``(distances, indices)`` pair produced by ``index.search``.
        """
        tokens = clip.tokenize([text]).to(self.device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            text_features = self.model.encode_text(tokens)
        distances, indices = self.index.search(
            self._to_query_vectors(text_features), k
        )
        return distances, indices

    def search_image(self, image=None, image_path=None, k=1):
        """Search the index with the CLIP embedding of an image.

        Args:
            image: An already opened image accepted by the CLIP preprocess
                transform; takes precedence over ``image_path``.
            image_path: Path of an image file, used when ``image`` is None.
            k: Number of neighbours to request.

        Returns:
            The ``(distances, indices)`` pair produced by ``index.search``.

        Raises:
            ValueError: If neither ``image`` nor ``image_path`` is given.
        """
        if image is None and image_path is None:
            raise ValueError("Either image or image_path must be provided.")

        if image is None:
            # Close the file handle once the image has been preprocessed.
            with Image.open(image_path) as opened:
                image_input = self.preprocess(opened)
        else:
            image_input = self.preprocess(image)
        image_input = image_input.unsqueeze(0).to(self.device)

        with torch.no_grad():
            image_features = self.model.encode_image(image_input)
        distances, indices = self.index.search(
            self._to_query_vectors(image_features), k
        )
        return distances, indices

    def _format_invoice_table(self, text):
        """Turn the raw text of an invoice page into a pipe-separated table."""
        match = re.search(
            r"Beschädigtes Teil.*?Gesamtsumme:.*?EUR", text, re.DOTALL
        )
        if match is None:
            raise ValueError("Could not locate the invoice table in the PDF text.")
        table_text = match.group()

        lines = table_text.splitlines()
        header = lines[0]
        other_text = "\n".join(lines[1:])
        # Join line breaks that do not follow a digit: those split a part name
        # across lines, whereas a row ends with its numeric columns.
        cleaned_text = re.sub(r"(?<!\d)\n", " ", other_text)

        inv = (header + "\n" + cleaned_text).split("\n")

        rows = []
        # Skip the original header (first) and the total line (last); the last
        # four space-separated tokens of each row are the value columns.
        for row in inv[1:-1]:
            tokens = row.split(" ")
            rows.append(" ".join(tokens[:-4]) + " | " + " | ".join(tokens[-4:]))

        return self._TABLE_HEADER + "\n".join(rows) + "\n" + inv[-1]

    def return_invoice_table(self, path=None, invoice_is_table=True, use_granular_invoice=False):
        """Return the table of one invoice.

        Args:
            path: Invoice key in the JSON mapping, or PDF file name relative
                to ``path_to_invoices`` when ``invoice_is_table`` is False.
            invoice_is_table: If True, look the invoice up in the loaded JSON
                data; otherwise parse the first page of the PDF.
            use_granular_invoice: With ``invoice_is_table``, use the granular
                JSON mapping instead of the regular one.

        Returns:
            The stored JSON entry, or the reformatted table string parsed from
            the PDF.

        Raises:
            ValueError: If required data is missing or the table cannot be
                found in the PDF text.
        """
        if path is None and not invoice_is_table:
            raise ValueError("Path to invoice must be provided.")
        if self.invoice_json is None and invoice_is_table:
            raise ValueError("Path to invoice json must be provided.")
        if self.invoice_json_granular is None and use_granular_invoice:
            raise ValueError("Path to granular invoice json must be provided.")

        if invoice_is_table:
            if use_granular_invoice:
                return self.invoice_json_granular[path]
            return self.invoice_json[path]

        pdf_path = f"{self.path_to_invoices}/{path}"
        reader = PdfReader(pdf_path)
        text = reader.pages[0].extract_text()
        return self._format_invoice_table(text)

    def find_invoice(
        self,
        image=None,
        image_path=None,
        return_only_path=True,
        k=1,
        damage_description=None,
        invoice_is_table=True,
        use_granular_invoice=False
    ):
        """Find the invoices whose indexed image is most similar to a query image.

        Args:
            image: Query image (see ``search_image``).
            image_path: Path of the query image, used when ``image`` is None.
            return_only_path: If True, return invoice names and image names
                without loading invoice contents.
            k: Number of neighbours to request from the index.
            damage_description: Query text for the reranker; required when a
                reranker is configured.
            invoice_is_table: Forwarded to ``return_invoice_table``.
            use_granular_invoice: Forwarded to ``return_invoice_table``.

        Returns:
            ``(invoices, images_paths)`` when ``return_only_path`` is True,
            otherwise ``(invoices_tables, invoices)``. Without a reranker the
            lists are ordered by similarity rank.

        Raises:
            ValueError: If no image/invoice table was loaded, a reranker is set
                without ``damage_description``, or PDFs must be parsed but
                ``path_to_invoices`` is unset.
        """
        if self.image_invoice_index is None:
            raise ValueError("No index for invoices found.")

        _, indices = self.search_image(image=image, image_path=image_path, k=k)

        # FAISS pads the result with -1 when fewer than k vectors exist; such
        # entries must not be used as positions (iloc[-1] is the last row).
        hits = [int(i) for i in indices[0] if i >= 0]
        img_ids = self.image_invoice_index.iloc[hits]["img_id"].tolist()

        # Select the matching rows, then restore the similarity order that a
        # plain boolean mask would discard.
        rank = {}
        for position, img_id in enumerate(img_ids):
            rank.setdefault(img_id, position)
        matches = self.image_invoice_index[
            self.image_invoice_index["img_id"].isin(img_ids)
        ]
        order = np.argsort(matches["img_id"].map(rank).to_numpy(), kind="stable")
        matches = matches.iloc[order]

        invoices = matches["invoice"].values.tolist()
        images_paths = matches["image"].values.tolist()

        if self.reranker:
            if damage_description is None:
                raise ValueError("Damage description must be provided.")
            images = [f"{self.path_to_images}/{img_path}" for img_path in images_paths]
            results = self.reranker.rank(damage_description, images, doc_ids=invoices)
            # NOTE(review): only `invoices` is reordered here; `images_paths`
            # keeps the pre-rerank order, so the two lists may no longer line
            # up element by element - confirm what callers expect.
            invoices = [doc.doc_id for doc in results]
            print(invoices)

        if return_only_path:
            return invoices, images_paths

        # The PDF directory is only needed when tables are parsed from PDFs;
        # JSON-backed lookups do not touch it.
        if not invoice_is_table and not self.path_to_invoices:
            raise ValueError("Path to data must be provided.")

        invoices_tables = [
            self.return_invoice_table(
                invoice, invoice_is_table, use_granular_invoice=use_granular_invoice
            )
            for invoice in invoices
        ]

        return invoices_tables, invoices
|
|
|
|
|
def build_rag(directory, clip_model="ViT-B/32"):
    """Build the image/invoice lookup table and FAISS index for ``directory``.

    For every ``*.pdf`` in ``{directory}/invoices_validated`` this:

    * writes the first image of the second page to ``{directory}/images``
      (named like the PDF with ``.pdf`` replaced by ``.png``; the bytes are
      stored exactly as extracted),
    * saves a copy of the PDF reduced to its first page to
      ``{directory}/invoices``,
    * records the invoice name, image name and the text of the second page.

    The records, plus a running ``img_id``, are written to
    ``{directory}/image_invoice.csv``. The L2-normalised CLIP embeddings of
    the images are added, in the same order, to an inner-product FAISS index
    saved as ``{directory}/invoice_index.faiss``.

    Args:
        directory: Root directory containing ``invoices_validated``.
        clip_model: CLIP model name passed to ``clip.load``.

    Raises:
        ValueError: If no PDF is found, or a PDF has fewer than two pages or
            no image on its second page.
    """
    source_dir = f"{directory}/invoices_validated"
    # os.listdir order is arbitrary; sort so img_id assignment is reproducible.
    invoices = sorted(
        name for name in os.listdir(source_dir) if name.endswith(".pdf")
    )
    if not invoices:
        raise ValueError(f"No PDF invoices found in {source_dir}.")

    os.makedirs(f"{directory}/images", exist_ok=True)
    os.makedirs(f"{directory}/invoices", exist_ok=True)

    image_invoice = []
    for invoice in invoices:
        # The context manager closes the document even if extraction fails.
        with fitz.open(f"{source_dir}/{invoice}") as doc:
            if doc.page_count < 2:
                raise ValueError(f"{invoice} has no second page to read from.")
            page = doc[1]
            image_list = page.get_images(full=True)
            if not image_list:
                raise ValueError(
                    f"No image found on the second page of {invoice}."
                )
            text = page.get_text()

            # First element of an image entry is its cross-reference number.
            xref = image_list[0][0]
            image_bytes = doc.extract_image(xref)["image"]
            image_name = invoice.replace(".pdf", ".png")
            with open(f"{directory}/images/{image_name}", "wb") as img_file:
                img_file.write(image_bytes)

            # Keep only the first page in the saved invoice copy.
            doc.delete_pages(range(1, doc.page_count))
            doc.save(f"{directory}/invoices/{invoice}")

        image_invoice.append({
            "invoice": invoice,
            "image": image_name,
            "description": text,
        })

    image_invoice = pd.DataFrame(image_invoice)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load(clip_model, device=device)
    images = image_invoice["image"].tolist()

    embeddings = []
    for img_path in tqdm(images):
        with Image.open(f"{directory}/images/{img_path}") as image:
            inputs = preprocess(image).unsqueeze(0).to(device)

        with torch.no_grad():
            image_embedding = model.encode_image(inputs)

        # Unit-length vectors make the inner product a cosine similarity.
        image_embedding = image_embedding / image_embedding.norm(dim=-1, keepdim=True)
        embeddings.append(image_embedding.cpu().numpy().astype("float32"))

    # img_id equals the position of the image's vector in the FAISS index.
    image_invoice["img_id"] = list(range(len(images)))
    image_invoice.to_csv(f"{directory}/image_invoice.csv", index=False)

    embeddings_np = np.vstack(embeddings)

    dimension = embeddings_np.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings_np)

    faiss.write_index(index, f"{directory}/invoice_index.faiss")
|
|