File size: 1,731 Bytes
f27eda8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import pdfplumber
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
import gradio as gr
def load_model_and_tokenizer(model_name="dbmdz/bert-large-cased-finetuned-conll03-english"):
    """Load the tokenizer and token-classification model for ``model_name``.

    Args:
        model_name: Identifier handed unchanged to both ``from_pretrained``
            calls.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Tuple elements are evaluated left to right, so the tokenizer is
    # loaded before the model.
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForTokenClassification.from_pretrained(model_name),
    )
def named_entity_recognition(text, tokenizer, model):
    """Tag ``text`` with the token-classification ``model``.

    The text is tokenized with ``truncation=True, max_length=512``, so any
    input beyond the first 512 tokens is not analysed.

    Args:
        text: The string to analyse.
        tokenizer: Callable tokenizer that also provides
            ``convert_ids_to_tokens``.
        model: Model whose output exposes ``logits`` and whose
            ``config.id2label`` maps class indices to label names.

    Returns:
        A list of ``(token, label)`` tuples, in input order, for every token
        whose predicted label is not ``'O'``. Tokens are reported exactly as
        ``convert_ids_to_tokens`` yields them (presumably sub-word pieces for
        WordPiece tokenizers -- they are not merged back into words here).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    with torch.no_grad():
        outputs = model(**inputs)

    # One bulk conversion to Python ints instead of ``.item()`` per token.
    label_ids = torch.argmax(outputs.logits, dim=-1)[0].tolist()
    id2label = model.config.id2label

    # Markers the tokenizer inserts itself (e.g. [CLS], [SEP]) are not part
    # of the document, so never report them as entities. Tokenizer-like
    # objects without ``all_special_tokens`` simply get no filtering.
    special_tokens = set(getattr(tokenizer, "all_special_tokens", ()))

    return [
        (token, id2label[label_id])
        for token, label_id in zip(tokens, label_ids)
        if token not in special_tokens and id2label[label_id] != 'O'
    ]
def extract_text_from_pdf(pdf):
    """Return the text of every page of ``pdf`` joined by single spaces.

    Args:
        pdf: Whatever ``pdfplumber.open`` accepts; it is passed through
            unchanged.

    Returns:
        The concatenated page texts with leading and trailing whitespace
        stripped, or ``""`` when no page yields any text.
    """
    with pdfplumber.open(pdf) as document:
        per_page = (page.extract_text() for page in document.pages)
        # Pages whose extract_text() result is falsy (None or "") are skipped.
        return " ".join(chunk for chunk in per_page if chunk).strip()
def process_pdf(pdf):
    """Gradio callback: extract text from the uploaded PDF and tag entities.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        pdf: The value supplied by the file-upload input, or ``None`` when
            the form was submitted without a file.

    Returns:
        The list of ``(token, label)`` tuples produced by
        ``named_entity_recognition``, or a human-readable message string
        when no file was given, no text could be extracted, or no entities
        were found.
    """
    # Submitting the form with no upload passes None; opening that would
    # raise instead of giving the user a readable message.
    if pdf is None:
        return "No PDF file was uploaded."

    text = extract_text_from_pdf(pdf)
    if not text:
        return "No text found in the PDF."

    entities = named_entity_recognition(text, tokenizer, model)
    return entities or "No named entities found."
# Load the model once at start-up; process_pdf reads these module-level names.
tokenizer, model = load_model_and_tokenizer()

# Build the web UI around process_pdf and start serving it.
demo = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="text",
    title="Named Entity Recognition from PDF",
    description="Upload a PDF file to extract text and perform Named Entity Recognition using a pre-trained BERT model.",
)
demo.launch()