kryman27 committed on
Commit
1379608
verified
1 Parent(s): aed8f19

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -20
app.py CHANGED
@@ -7,42 +7,43 @@ from transformers import pipeline
7
  extractor = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")
8
 
9
  # Reguły do wykrywania NIP, kwot, dat
10
- nip_pattern = re.compile(r'\b\d{10}\b')
11
- kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b')
12
- data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
 
13
 
14
  def extract_invoice_data(pdf_file):
15
  with pdfplumber.open(pdf_file) as pdf:
16
  full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
17
 
18
- # Szukamy danych w tekście
19
  entities = extractor(full_text)
20
-
21
  seller_name = []
22
- seller_nip = None
23
- items = []
24
- total_amount = None
25
- invoice_date = None
26
-
27
  for entity in entities:
28
  if "ORG" in entity["entity_group"]:
29
- seller_name.append(entity["word"]) # Zbieramy nazw臋 sprzedawcy
30
 
31
  # Znajdujemy wartości numeryczne dla NIP, kwot, dat
32
  seller_nip = nip_pattern.search(full_text)
33
- invoice_date = data_pattern.search(full_text)
34
-
35
- # **Naprawiamy błąd przetwarzania liczb**
36
  kwoty = kwota_pattern.findall(full_text)
37
- kwoty = [float(k.replace(",", ".")) for k in kwoty if k.replace(",", ".").replace(".", "").isdigit()] # Zamiana przecinka na kropk臋
38
-
39
- total_amount = max(kwoty) if kwoty else None # Pobranie najwi臋kszej warto艣ci jako ca艂kowita kwota faktury
 
 
 
 
 
 
 
 
40
 
41
  return {
42
  "Sprzedawca": " ".join(seller_name) if seller_name else "Nie znaleziono",
43
  "NIP": seller_nip.group() if seller_nip else "Nie znaleziono",
44
- "Data faktury": invoice_date.group() if invoice_date else "Nie znaleziono",
45
- "Kwota ca艂kowita": total_amount if total_amount else "Nie znaleziono"
46
  }
47
 
48
  # Interfejs użytkownika w Hugging Face Spaces
@@ -51,7 +52,7 @@ iface = gr.Interface(
51
  inputs=gr.File(label="Wybierz plik PDF"),
52
  outputs="json",
53
  title="Ekstrakcja danych z faktury",
54
- description="Prze艣lij plik PDF, a model zwr贸ci dane sprzedawcy, NIP, kwot臋 i dat臋 faktury."
55
  )
56
 
57
  if __name__ == "__main__":
 
7
  extractor = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")
8
 
9
  # Reguły do wykrywania NIP, kwot, dat
10
+ nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b') # Polski NIP (z "PL" lub bez)
11
+ kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b') # Kwoty: np. 123.45 lub 123
12
+ data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b') # Daty w formacie DD.MM.YYYY
13
+ payment_keywords = ["data p艂atno艣ci", "termin p艂atno艣ci", "zap艂ata", "p艂atno艣膰"]
14
 
15
  def extract_invoice_data(pdf_file):
16
  with pdfplumber.open(pdf_file) as pdf:
17
  full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
18
 
19
+ # Znalezienie nazw organizacji
20
  entities = extractor(full_text)
 
21
  seller_name = []
22
+
 
 
 
 
23
  for entity in entities:
24
  if "ORG" in entity["entity_group"]:
25
+ seller_name.append(entity["word"])
26
 
27
  # Znajdujemy wartości numeryczne dla NIP, kwot, dat
28
  seller_nip = nip_pattern.search(full_text)
 
 
 
29
  kwoty = kwota_pattern.findall(full_text)
30
+ kwoty = [float(k.replace(",", ".")) for k in kwoty if k.replace(",", ".").replace(".", "").isdigit()]
31
+ total_amount = max(kwoty) if kwoty else None
32
+
33
+ # Szukamy daty płatności na podstawie kontekstu
34
+ payment_date = None
35
+ for line in full_text.split("\n"):
36
+ if any(keyword in line.lower() for keyword in payment_keywords):
37
+ date_match = data_pattern.search(line)
38
+ if date_match:
39
+ payment_date = date_match.group()
40
+ break
41
 
42
  return {
43
  "Sprzedawca": " ".join(seller_name) if seller_name else "Nie znaleziono",
44
  "NIP": seller_nip.group() if seller_nip else "Nie znaleziono",
45
+ "Kwota ca艂kowita": total_amount if total_amount else "Nie znaleziono",
46
+ "Data p艂atno艣ci": payment_date if payment_date else "Nie znaleziono"
47
  }
48
 
49
  # Interfejs użytkownika w Hugging Face Spaces
 
52
  inputs=gr.File(label="Wybierz plik PDF"),
53
  outputs="json",
54
  title="Ekstrakcja danych z faktury",
55
+ description="Prze艣lij plik PDF, a model zwr贸ci dane sprzedawcy, NIP, kwot臋 i dat臋 p艂atno艣ci."
56
  )
57
 
58
  if __name__ == "__main__":