import gradio as gr
import pdfplumber
from transformers import pipeline

# Model initialisation: build the token-classification ("ner") pipeline once
# at import time so every request reuses the same loaded model.
# NOTE(review): "opendatalab/PDF-Extract-Kit" does not look like a
# token-classification checkpoint — confirm this model id actually loads
# with the "ner" task before relying on it.
extractor = pipeline("ner", model="opendatalab/PDF-Extract-Kit")

def extract_info(pdf_file):
    """Extract named entities from a PDF and group them by label.

    The text of every page is pulled out with pdfplumber, passed through the
    module-level ``extractor`` pipeline, and the detected words are grouped
    under their entity label.

    Args:
        pdf_file: The uploaded PDF. Either a filesystem path, or a file-like
            object accepted by ``pdfplumber.open``. If the object exposes a
            string ``name`` attribute (as temp-file wrappers do), that path
            is opened instead of the stream. ``None`` (nothing uploaded)
            is tolerated.

    Returns:
        A dict mapping each entity label to the list of words tagged with
        that label, in the order the pipeline returned them. An empty dict
        is returned when no file was supplied or the PDF has no extractable
        text.
    """
    # Nothing uploaded: report "no entities" rather than crashing in
    # pdfplumber.
    if pdf_file is None:
        return {}

    # Prefer an on-disk path when the upload object carries one; reading a
    # handed-over stream depends on its current position and open state.
    source = getattr(pdf_file, "name", pdf_file)
    if not isinstance(source, str):
        source = pdf_file

    with pdfplumber.open(source) as pdf:
        # extract_text() can yield None for a page without a text layer
        # (e.g. a scanned image), so collect first and filter afterwards.
        page_texts = [page.extract_text() for page in pdf.pages]

    # Join with a newline so the last word of one page is not glued to the
    # first word of the next.
    text = "\n".join(chunk for chunk in page_texts if chunk)
    if not text.strip():
        return {}

    # Run the text through the model.
    entities = extractor(text)

    # Group the detected words by their entity label.
    results = {}
    for entity in entities:
        results.setdefault(entity['entity'], []).append(entity['word'])

    return results

# User interface: a single-function Gradio app that takes a PDF upload and
# shows the grouped entities as JSON.
iface = gr.Interface(
    fn=extract_info,
    # Use the top-level component classes; the legacy gr.inputs / gr.outputs
    # namespaces were deprecated in Gradio 3 and removed in Gradio 4.
    inputs=gr.File(label="Wybierz plik PDF"),
    outputs=gr.JSON(label="Wykryte informacje"),
    title="Ekstrakcja informacji z faktur PDF",
    description="Prześlij plik PDF z fakturą, aby wyodrębnić określone informacje."
)

# Start the Gradio server only when executed as a script, so importing this
# module does not launch the app.
if __name__ == "__main__":
    iface.launch()