File size: 744 Bytes
07d0354
814c19e
10213d3
 
af0905c
 
814c19e
ce53438
814c19e
af0905c
159c760
07d0354
af0905c
159c760
10213d3
 
db576bd
159c760
 
 
 
af0905c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import gradio as gr
import pdfplumber
from transformers import pipeline

# Module-level named-entity-recognition pipeline, built once at import time
# and reused by the code below. The original author's note describes this
# checkpoint as a lighter, publicly available NER model.
extractor = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    # "simple" aggregation groups tokens into whole entities; the consumer
    # below reads the resulting `entity_group` label of each entity.
    aggregation_strategy="simple",
)

def extract_seller(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        # Pobranie tekstu z PDF
        full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())

    # Podział tekstu na krótkie fragmenty (maks. 512 znaków)
    chunks = [full_text[i:i+512] for i in range(0, len(full_text), 512)]

    seller_name = None

    for chunk in chunks:
        entities = extractor(chunk)

        for entity in entities:
            if "ORG" in entity["entity_group"]:  #