|
import pdfplumber |
|
from transformers import AutoModelForTokenClassification, AutoTokenizer |
|
import torch |
|
import gradio as gr |
|
|
|
def load_model_and_tokenizer(model_name="dbmdz/bert-large-cased-finetuned-conll03-english"):
    """Load the tokenizer and token-classification model for *model_name*.

    Args:
        model_name: Identifier handed unchanged to both ``from_pretrained``
            calls.

    Returns:
        A ``(tokenizer, model)`` tuple, in that order.
    """
    # Tuple elements are evaluated left to right, so the tokenizer is still
    # loaded before the model.
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForTokenClassification.from_pretrained(model_name),
    )
|
|
|
def named_entity_recognition(text, tokenizer, model, max_length=512):
    """Label the tokens of *text* and return those tagged as entities.

    The text is tokenized, truncated to at most *max_length* tokens, and run
    through *model* once. Anything beyond that limit is not analysed. Tokens
    are returned exactly as the tokenizer produced them; sub-word pieces are
    not merged back into words.

    Args:
        text: Raw text to analyse.
        tokenizer: Tokenizer matching *model*.
        model: Token-classification model exposing ``config.id2label``.
        max_length: Maximum number of tokens fed to the model.

    Returns:
        A list of ``(token, label)`` tuples for every token whose predicted
        label is not ``'O'``. Empty or whitespace-only text yields ``[]``.
    """
    # Nothing to tag: skip the forward pass entirely.
    if not text or not text.strip():
        return []

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Inference only; no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # One predicted label id per token of the single sequence in the batch.
    label_ids = torch.argmax(outputs.logits, dim=-1)[0].tolist()
    id2label = model.config.id2label

    # Markers added by the tokenizer are not part of the document, so they
    # must never be reported as entities even if the model mislabels them.
    special_tokens = set(tokenizer.all_special_tokens)

    entities = []
    for token, label_id in zip(tokens, label_ids):
        if token in special_tokens:
            continue
        label = id2label[label_id]
        if label != 'O':
            entities.append((token, label))
    return entities
|
|
|
def extract_text_from_pdf(pdf):
    """Return the text of every page of *pdf* joined by single spaces.

    Args:
        pdf: Whatever ``pdfplumber.open`` accepts for the document.

    Returns:
        The concatenated page text with leading and trailing whitespace
        removed. Pages whose ``extract_text()`` result is falsy are skipped,
        so the result is ``""`` when no page has any text.
    """
    # Extract while the document is still open; join after it is closed.
    with pdfplumber.open(pdf) as document:
        page_texts = [page.extract_text() for page in document.pages]

    return " ".join(chunk for chunk in page_texts if chunk).strip()
|
|
|
def process_pdf(pdf):
    """Extract text from an uploaded PDF and run named entity recognition.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        pdf: The uploaded file as delivered by the UI, or ``None`` when the
            form was submitted without a file.

    Returns:
        The list of ``(token, label)`` entity tuples, or a human-readable
        message when there is no file, no extractable text, or no entity.
    """
    # Without this guard a missing upload would be passed on to the PDF
    # reader and fail there instead of giving the user a clear message.
    if pdf is None:
        return "No PDF file uploaded."

    text = extract_text_from_pdf(pdf)
    if not text:
        return "No text found in the PDF."

    entities = named_entity_recognition(text, tokenizer, model)
    return entities or "No named entities found."
|
|
|
# Load the tokenizer and model once at import time; process_pdf reads these
# module-level names on every request.
tokenizer, model = load_model_and_tokenizer()

# Describe the web UI first, then start serving it.
demo = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="text",
    title="Named Entity Recognition from PDF",
    description="Upload a PDF file to extract text and perform Named Entity Recognition using a pre-trained BERT model.",
)
demo.launch()