Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -4,89 +4,124 @@ import re
|
|
4 |
from transformers import LayoutLMForTokenClassification, AutoTokenizer
|
5 |
import torch
|
6 |
|
7 |
-
# Wczytanie modelu LayoutLMv3
|
8 |
model_name = "kryman27/layoutlmv3-finetuned"
|
9 |
model = LayoutLMForTokenClassification.from_pretrained(model_name)
|
10 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
11 |
|
12 |
-
#
|
13 |
-
nip_pattern = re.compile(r'\
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
def extract_invoice_data(pdf_file):
|
20 |
with pdfplumber.open(pdf_file) as pdf:
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
full_text = "\n".join(full_text)
|
35 |
-
|
36 |
-
# Tokenizacja + bounding boxes
|
37 |
-
encoding = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt", truncation=True)
|
38 |
|
39 |
-
#
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
-
#
|
45 |
-
|
46 |
-
|
47 |
-
if
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
-
#
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
-
#
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
|
67 |
-
#
|
68 |
-
|
69 |
-
|
70 |
-
if any(keyword in line for keyword in payment_keywords):
|
71 |
-
date_match = data_pattern.search(line)
|
72 |
-
if date_match:
|
73 |
-
payment_date = date_match.group()
|
74 |
-
break
|
75 |
|
76 |
-
|
77 |
-
"Sprzedawca":
|
78 |
-
"
|
79 |
-
"
|
80 |
-
"
|
|
|
|
|
81 |
}
|
|
|
82 |
|
83 |
-
# Interfejs użytkownika
|
84 |
iface = gr.Interface(
|
85 |
fn=extract_invoice_data,
|
86 |
inputs=gr.File(label="Wybierz plik PDF"),
|
87 |
outputs="json",
|
88 |
title="Ekstrakcja danych z faktury",
|
89 |
-
description="Prześlij plik PDF, a
|
90 |
)
|
91 |
|
92 |
if __name__ == "__main__":
|
|
|
4 |
from transformers import LayoutLMForTokenClassification, AutoTokenizer
|
5 |
import torch
|
6 |
|
|
|
7 |
# Hugging Face Hub checkpoint shared by the model and its tokenizer.
model_name = "kryman27/layoutlmv3-finetuned"

# Both objects are created once, at import time.
# NOTE(review): neither `model` nor `tokenizer` is referenced by the
# extraction code visible in this file - confirm whether they are still needed.
model = LayoutLMForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
10 |
|
11 |
+
# Pre-compiled regular expressions for the invoice fields extracted below.
# Every label pattern is case-insensitive and captures its value in group 1.

# Date value shared by all date fields: DD<sep>MM<sep>YYYY where <sep> is
# '.', '-' or '/'.
_DATE_GROUP = r'(\d{2}[.\-/]\d{2}[.\-/]\d{4})'

# NIP: ten digits, optionally prefixed with "PL". Besides the plain ten-digit
# form, the hyphen-grouped layouts 3-3-2-2 and 3-2-2-3 are accepted so that
# values such as "123-456-32-18" are not missed. The whole match (including
# any "PL" prefix and hyphens) is the value; there is no capture group.
nip_pattern = re.compile(
    r'\b(?:PL\s?)?(?:\d{3}-?\d{3}-?\d{2}-?\d{2}|\d{3}-?\d{2}-?\d{2}-?\d{3})\b'
)

# "Faktura [VAT] nr|nr.|# <number>"
invoice_number_pattern = re.compile(
    r'Faktura\s*(?:VAT)?\s*(?:nr\.?|#)\s*([\w\-/]+)', re.IGNORECASE
)

# Date fields, each introduced by its label.
sale_date_pattern = re.compile(
    r'Data\s+wystawienia[:\s]*' + _DATE_GROUP, re.IGNORECASE
)
delivery_date_pattern = re.compile(
    r'Data\s+dostawy[:\s]*' + _DATE_GROUP, re.IGNORECASE
)
payment_date_pattern = re.compile(
    r'(?:Termin\s+płatności|Data\s+płatności)[:\s]*' + _DATE_GROUP,
    re.IGNORECASE,
)
order_date_pattern = re.compile(
    r'Data\s+zamówienia[:\s]*' + _DATE_GROUP, re.IGNORECASE
)

# Order / sales-order identifiers (word characters, '-' and '/').
order_number_pattern = re.compile(
    r'Zamówienie\s*Nr[:\s]*([\w\-/]+)', re.IGNORECASE
)
sale_order_pattern = re.compile(
    r'Zlecenie\s+sprzedaży\s*Nr[:\s]*([\w\-/]+)', re.IGNORECASE
)

# Payment details: the amount is captured as raw text (digits, '.' and ',').
payment_amount_pattern = re.compile(
    r'(?:Kwota\s+zapłacona)[:\s]*([\d.,]+)', re.IGNORECASE
)
payment_method_pattern = re.compile(
    r'(?:Forma\s+płatności)[:\s]*([\w/]+)', re.IGNORECASE
)
|
22 |
+
|
23 |
+
def extract_section(text, section_title):
    """Return the text that follows ``<section_title>:`` in ``text``.

    The heading is matched case-insensitively and literally (regex
    metacharacters in ``section_title`` are escaped). The captured body starts
    right after the colon and ends before the first newline that is directly
    followed by a non-whitespace character, or at the end of ``text``. In
    other words, following lines belong to the section only while they start
    with whitespace.

    Args:
        text: Text to search; ``None`` or an empty string yields ``None``.
        section_title: Heading to look for, without the trailing colon.

    Returns:
        The section body with surrounding whitespace stripped, or ``None``
        when the heading does not occur.
    """
    if not text:
        return None
    pattern = re.compile(
        # re.escape keeps titles such as "Koszt (netto)" from being read as
        # regex syntax.
        rf'{re.escape(section_title)}:(.*?)(?=\n\S|$)',
        re.IGNORECASE | re.DOTALL,
    )
    match = pattern.search(text)
    return match.group(1).strip() if match else None
|
27 |
|
28 |
# Placeholder returned for every field that could not be located.
_NOT_FOUND = "Nie znaleziono"


def _group_or_default(match, default=_NOT_FOUND):
    """Return group 1 of ``match``, or ``default`` when there is no match."""
    return match.group(1) if match else default


def _parse_amount(raw):
    """Convert a matched amount string to ``float``.

    Accepts ',' or '.' as the decimal separator. When both characters occur,
    the right-most one is taken as the decimal separator and the other as a
    thousands separator ("1.234,56" and "1,234.56" both give 1234.56).
    Returns the "Nie znaleziono" placeholder when the text is still not a
    valid number, instead of raising.
    """
    cleaned = raw.strip().rstrip('.,')
    last_comma = cleaned.rfind(',')
    last_dot = cleaned.rfind('.')
    if last_comma != -1 and last_dot != -1:
        if last_comma > last_dot:
            cleaned = cleaned.replace('.', '').replace(',', '.')
        else:
            cleaned = cleaned.replace(',', '')
    else:
        cleaned = cleaned.replace(',', '.')
    try:
        return float(cleaned)
    except ValueError:
        return _NOT_FOUND


def extract_invoice_data(pdf_file):
    """Extract structured invoice fields from the text layer of a PDF.

    The text of all pages is joined with newlines and searched with the
    module-level regex patterns and ``extract_section``.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts; this function is wired
            as the callback of the Gradio ``gr.File`` input.

    Returns:
        A dict with the keys "Sprzedawca", "Nabywca", "Faktura", "Płatność",
        "Pozycje" and "Podsumowanie". Missing values are reported as
        "Nie znaleziono" (or "Nie podano" for the buyer's NIP). Amounts are
        floats when they could be parsed. "Pozycje" is always an empty list
        because line-item extraction is not implemented.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Extract each page once; pages without a text layer are skipped.
        page_texts = (page.extract_text() for page in pdf.pages)
        full_text = "\n".join(text for text in page_texts if text)

    # Split out the seller / buyer sections by their headings.
    sprzedawca_section = extract_section(full_text, "Sprzedawca")
    nabywca_section = extract_section(full_text, "Nabywca")

    # --- Seller -----------------------------------------------------------
    sprzedawca = {
        "Nazwa": _NOT_FOUND,
        "NIP": _NOT_FOUND,
        "Numer Rejestracyjny BDO": _NOT_FOUND,
        "Adres": _NOT_FOUND,
        "Telefon": _NOT_FOUND,
        "Fax": _NOT_FOUND,
    }
    if sprzedawca_section:
        lines = sprzedawca_section.splitlines()
        # The first line of the section is taken as the name, the second as
        # the address.
        if lines:
            sprzedawca["Nazwa"] = lines[0].strip()
        if len(lines) > 1:
            sprzedawca["Adres"] = lines[1].strip()

        nip_match = nip_pattern.search(sprzedawca_section)
        if nip_match:
            sprzedawca["NIP"] = nip_match.group()

        sprzedawca["Numer Rejestracyjny BDO"] = _group_or_default(
            re.search(r'BDO[:\s]*([\w\d]+)', sprzedawca_section, re.IGNORECASE)
        )

        telefon_match = re.search(
            r'tel\.?[:\s]*([\+\d\s()-]+)', sprzedawca_section, re.IGNORECASE
        )
        if telefon_match:
            sprzedawca["Telefon"] = telefon_match.group(1).strip()

        fax_match = re.search(
            r'fax\.?[:\s]*([\+\d\s()-]+)', sprzedawca_section, re.IGNORECASE
        )
        if fax_match:
            sprzedawca["Fax"] = fax_match.group(1).strip()

    # --- Buyer ------------------------------------------------------------
    nabywca = {
        "Nazwa": _NOT_FOUND,
        "NIP": "Nie podano",
        "Adres": _NOT_FOUND,
        "Nr Klienta": _NOT_FOUND,
    }
    if nabywca_section:
        lines = nabywca_section.splitlines()
        if lines:
            nabywca["Nazwa"] = lines[0].strip()
        if len(lines) > 1:
            nabywca["Adres"] = lines[1].strip()

        nip_match = nip_pattern.search(nabywca_section)
        if nip_match:
            nabywca["NIP"] = nip_match.group()

        nabywca["Nr Klienta"] = _group_or_default(
            re.search(r'Nr\s+Klienta[:\s]*([\w\d]+)', nabywca_section, re.IGNORECASE)
        )

    # --- Invoice header ---------------------------------------------------
    faktura = {
        "Numer": _group_or_default(invoice_number_pattern.search(full_text)),
        "Data Wystawienia": _group_or_default(sale_date_pattern.search(full_text)),
        "Data Dostawy": _group_or_default(delivery_date_pattern.search(full_text)),
        "Zamówienie Nr": _group_or_default(order_number_pattern.search(full_text)),
        "Data Zamówienia": _group_or_default(order_date_pattern.search(full_text)),
        "Zlecenie Sprzedaży Nr": _group_or_default(sale_order_pattern.search(full_text)),
    }

    # --- Payment ----------------------------------------------------------
    payment_amount_match = payment_amount_pattern.search(full_text)
    platnosc = {
        "Termin Zapłaty": _group_or_default(payment_date_pattern.search(full_text)),
        "Forma Zapłaty": _group_or_default(payment_method_pattern.search(full_text)),
        "Kwota Zapłacona": (
            _parse_amount(payment_amount_match.group(1))
            if payment_amount_match
            else _NOT_FOUND
        ),
    }

    # TODO: table (line item) extraction is not implemented yet.
    pozycje = []

    # --- Summary (example: first number after "Razem") ----------------------
    podsumowanie_match = re.search(r'Razem[:\s]*([\d.,]+)', full_text)
    podsumowanie = {
        "Suma Brutto": (
            _parse_amount(podsumowanie_match.group(1))
            if podsumowanie_match
            else _NOT_FOUND
        ),
    }

    result = {
        "Sprzedawca": sprzedawca,
        "Nabywca": nabywca,
        "Faktura": faktura,
        "Płatność": platnosc,
        "Pozycje": pozycje,
        "Podsumowanie": podsumowanie,
    }
    return result
|
118 |
|
|
|
119 |
# Gradio UI: a single PDF upload is passed to extract_invoice_data and the
# returned dict is rendered as JSON.
iface = gr.Interface(
    title="Ekstrakcja danych z faktury",
    description="Prześlij plik PDF, a narzędzie zwróci szczegółowe dane faktury.",
    fn=extract_invoice_data,
    inputs=gr.File(label="Wybierz plik PDF"),
    outputs="json",
)
|
126 |
|
127 |
if __name__ == "__main__":
|