Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -7,42 +7,43 @@ from transformers import pipeline
|
|
7 |
extractor = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")
|
8 |
|
9 |
# Reguły do wykrywania NIP, kwot, dat
|
10 |
-
nip_pattern = re.compile(r'\b\d{10}\b')
|
11 |
-
kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b')
|
12 |
-
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
|
|
|
13 |
|
14 |
def extract_invoice_data(pdf_file):
|
15 |
with pdfplumber.open(pdf_file) as pdf:
|
16 |
full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
|
17 |
|
18 |
-
#
|
19 |
entities = extractor(full_text)
|
20 |
-
|
21 |
seller_name = []
|
22 |
-
|
23 |
-
items = []
|
24 |
-
total_amount = None
|
25 |
-
invoice_date = None
|
26 |
-
|
27 |
for entity in entities:
|
28 |
if "ORG" in entity["entity_group"]:
|
29 |
-
seller_name.append(entity["word"])
|
30 |
|
31 |
# Znajdujemy wartości numeryczne dla NIP, kwot, dat
|
32 |
seller_nip = nip_pattern.search(full_text)
|
33 |
-
invoice_date = data_pattern.search(full_text)
|
34 |
-
|
35 |
-
# **Naprawiamy błąd przetwarzania liczb**
|
36 |
kwoty = kwota_pattern.findall(full_text)
|
37 |
-
kwoty = [float(k.replace(",", ".")) for k in kwoty if k.replace(",", ".").replace(".", "").isdigit()]
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
return {
|
42 |
"Sprzedawca": " ".join(seller_name) if seller_name else "Nie znaleziono",
|
43 |
"NIP": seller_nip.group() if seller_nip else "Nie znaleziono",
|
44 |
-
"
|
45 |
-
"
|
46 |
}
|
47 |
|
48 |
# Interfejs użytkownika w Hugging Face Spaces
|
@@ -51,7 +52,7 @@ iface = gr.Interface(
|
|
51 |
inputs=gr.File(label="Wybierz plik PDF"),
|
52 |
outputs="json",
|
53 |
title="Ekstrakcja danych z faktury",
|
54 |
-
description="Prze艣lij plik PDF, a model zwr贸ci dane sprzedawcy, NIP, kwot臋 i dat臋
|
55 |
)
|
56 |
|
57 |
if __name__ == "__main__":
|
|
|
7 |
# Token-classification ("ner") pipeline, built once at module level and reused by
# extract_invoice_data; aggregation_strategy="simple" makes it return grouped
# entities carrying "entity_group" and "word" fields.
# NOTE(review): the checkpoint name suggests an English CoNLL-03 fine-tune while the
# invoices handled here appear to be Polish — confirm it is adequate for that text.
extractor = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")
|
8 |
|
9 |
# Patterns used to spot the tax ID (NIP), amounts and dates in invoice text.
nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')  # Polish NIP, with or without the "PL" prefix
kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b')  # Amounts, e.g. 123.45 or 123
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')  # Dates in DD.MM.YYYY format
# Lower-case phrases that mark a line as carrying the payment date. They are
# compared against lower-cased text, so they must stay lower-case and be
# spelled with real Polish diacritics (mis-decoded bytes would never match).
payment_keywords = ["data płatności", "termin płatności", "zapłata", "płatność"]
|
14 |
|
15 |
def extract_invoice_data(pdf_file):
    """Extract seller name, NIP, total amount and payment date from an invoice PDF.

    Args:
        pdf_file: Passed unchanged to ``pdfplumber.open``.

    Returns:
        dict with the keys "Sprzedawca", "NIP", "Kwota całkowita" and
        "Data płatności". A field that could not be found holds the string
        "Nie znaleziono"; otherwise the amount is a float and the remaining
        fields are strings.

    Notes:
        The values are heuristics: the seller is every ORG entity joined by
        spaces, the NIP is the first ``nip_pattern`` match, the total is the
        largest number left in the text after NIP and date matches are
        removed, and the payment date is the first date on a line containing
        one of ``payment_keywords``.
    """
    not_found = "Nie znaleziono"

    with pdfplumber.open(pdf_file) as pdf:
        # Extract each page exactly once (the filter and the join used to
        # trigger two separate extractions per page).
        page_texts = [page.extract_text() for page in pdf.pages]
    # Pages that yield no text are skipped.
    full_text = "\n".join(text for text in page_texts if text)

    # Organisation names found by the NER pipeline; skip the model entirely
    # when there is no text to analyse.
    entities = extractor(full_text) if full_text else []
    seller_name = [
        entity["word"] for entity in entities if "ORG" in entity["entity_group"]
    ]

    seller_nip = nip_pattern.search(full_text)

    # Blank out NIP and date matches before looking for amounts: otherwise the
    # 10-digit NIP (or fragments such as "12.05" / "2024" of a date) would be
    # parsed as numbers and the NIP would almost always win max().
    amount_text = data_pattern.sub(" ", nip_pattern.sub(" ", full_text))
    amounts = []
    for raw in kwota_pattern.findall(amount_text):
        normalized = raw.replace(",", ".")  # accept a decimal comma
        if normalized.replace(".", "").isdigit():
            amounts.append(float(normalized))
    total_amount = max(amounts) if amounts else None

    # Payment date: first date on a line that mentions a payment keyword.
    payment_date = None
    for line in full_text.split("\n"):
        lowered = line.lower()
        if not any(keyword in lowered for keyword in payment_keywords):
            continue
        date_match = data_pattern.search(line)
        if date_match:
            payment_date = date_match.group()
            break

    return {
        "Sprzedawca": " ".join(seller_name) if seller_name else not_found,
        "NIP": seller_nip.group() if seller_nip else not_found,
        # Compare against None so a genuine 0.0 total is not reported as missing.
        "Kwota całkowita": total_amount if total_amount is not None else not_found,
        "Data płatności": payment_date if payment_date else not_found,
    }
|
48 |
|
49 |
# Interfejs użytkownika w Hugging Face Spaces
|
|
|
52 |
inputs=gr.File(label="Wybierz plik PDF"),
|
53 |
outputs="json",
|
54 |
title="Ekstrakcja danych z faktury",
|
55 |
+
description="Prze艣lij plik PDF, a model zwr贸ci dane sprzedawcy, NIP, kwot臋 i dat臋 p艂atno艣ci."
|
56 |
)
|
57 |
|
58 |
if __name__ == "__main__":
|