Spaces:

kryman27
/

pdf-extractor

Running

pdf-extractor / app.py

Update app.py

db576bd verified 5 months ago

1.06 kB

	import gradio as gr
	import pdfplumber
	from transformers import pipeline

	# Build the named-entity-recognition pipeline once at import time; the
	# resulting callable is stored at module level under the name `extractor`.
	extractor = pipeline(
	    task="ner",
	    model="dslim/bert-base-NER",
	    aggregation_strategy="simple",
	)

	def extract_info(pdf_file):
	    """Extract named entities from a PDF and group them by entity label.

	    Args:
	        pdf_file: The uploaded PDF; it is passed unchanged to
	            ``pdfplumber.open``. ``None`` (nothing uploaded) is accepted.

	    Returns:
	        dict: Maps each ``entity_group`` label reported by the NER pipeline
	        to the list of matched ``word`` values, in the order the pipeline
	        returned them. Empty when no file was given, when the PDF contains
	        no extractable text, or when the pipeline reports no entities.
	    """
	    # Nothing uploaded: report "no entities" instead of failing inside
	    # pdfplumber.
	    if pdf_file is None:
	        return {}

	    with pdfplumber.open(pdf_file) as pdf:
	        # Run text extraction exactly once per page and drop pages that
	        # yield no text (None or an empty string). The join must happen
	        # inside the `with` block, while the PDF is still open.
	        page_texts = filter(None, (page.extract_text() for page in pdf.pages))
	        text = "\n".join(page_texts)

	    # No usable text means there is nothing for the model to tag.
	    if not text.strip():
	        return {}

	    # NOTE(review): the whole document is sent to the model in one call;
	    # confirm how the pipeline behaves on PDFs longer than the model's
	    # maximum input length.
	    entities = extractor(text)

	    # Group the recognised words under their entity label.
	    extracted_data = {}
	    for entity in entities:
	        extracted_data.setdefault(entity["entity_group"], []).append(entity["word"])

	    return extracted_data

	# User interface for the Hugging Face Space: a file input wired to
	# `extract_info`, with the result shown as JSON.
	iface = gr.Interface(
	    title="Ekstrakcja informacji z faktur PDF",
	    description="Prześlij plik PDF z fakturą, a model rozpozna kluczowe informacje.",
	    fn=extract_info,
	    inputs=gr.File(label="Wybierz plik PDF"),
	    outputs="json",
	)

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()