"""Gradio app that extracts text from an uploaded PDF and tags named entities.

The text is pulled out of the PDF with pdfplumber and run through a
Hugging Face token-classification model; every word whose predicted label
is not ``"O"`` is reported together with that label.
"""

import os

import gradio as gr
import pdfplumber
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer


def load_model_and_tokenizer(model_name="dbmdz/bert-large-cased-finetuned-conll03-english"):
    """Load the tokenizer and token-classification model for *model_name*.

    Args:
        model_name: Model identifier passed unchanged to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    return tokenizer, model


def _merge_wordpieces(tokens, labels, special_tokens=()):
    """Join ``##`` continuation pieces onto the word they belong to.

    Each merged word keeps the label predicted for its first piece. Tokens
    listed in *special_tokens* are dropped. Tokenizers that do not mark
    continuations with ``##`` pass through unchanged.
    """
    merged = []
    for token, label in zip(tokens, labels):
        if token in special_tokens:
            continue
        if token.startswith("##") and merged:
            word, first_label = merged[-1]
            merged[-1] = (word + token[2:], first_label)
        else:
            merged.append((token, label))
    return merged


def named_entity_recognition(text, tokenizer, model, max_length=512):
    """Return the ``(word, label)`` pairs the model tags as entities.

    Args:
        text: Input text to analyse.
        tokenizer: Tokenizer matching *model*.
        model: Token-classification model exposing ``config.id2label``.
        max_length: Maximum number of tokens fed to the model. The input is
            truncated to this length, so text beyond it is NOT analysed.

    Returns:
        A list of ``(word, label)`` tuples for every word whose label is not
        ``"O"``. Sub-word pieces are merged into whole words and special
        tokens are excluded.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    label_ids = torch.argmax(logits, dim=-1)[0].tolist()
    labels = [model.config.id2label[label_id] for label_id in label_ids]

    words = _merge_wordpieces(tokens, labels, set(tokenizer.all_special_tokens))
    return [(word, label) for word, label in words if label != "O"]


def extract_text_from_pdf(pdf):
    """Return the text of every page of *pdf*, joined by single spaces.

    Args:
        pdf: Anything ``pdfplumber.open`` accepts.

    Returns:
        The concatenated page text with surrounding whitespace stripped;
        pages for which no text is extracted are skipped. Empty string when
        nothing could be extracted.
    """
    with pdfplumber.open(pdf) as pdf_file:
        page_texts = [page.extract_text() for page in pdf_file.pages]
    return " ".join(page_text for page_text in page_texts if page_text).strip()


def _resolve_pdf_source(pdf):
    """Return a path for *pdf* when it is a file wrapper exposing one.

    Strings and path-like objects are returned untouched. For any other
    object, a string ``name`` attribute is taken to be the path on disk
    (some Gradio versions hand the upload over as such a wrapper rather
    than as a plain path).
    """
    if isinstance(pdf, (str, os.PathLike)):
        return pdf
    name = getattr(pdf, "name", None)
    return name if isinstance(name, str) else pdf


def process_pdf(pdf):
    """Gradio callback: extract text from *pdf* and run entity recognition.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        pdf: The value supplied by the ``gr.File`` input, or ``None`` when
            nothing was uploaded.

    Returns:
        The list of ``(word, label)`` entities, or a human-readable message
        when there is no upload, no extractable text, or no entities.
    """
    if pdf is None:
        return "No PDF uploaded."

    text = extract_text_from_pdf(_resolve_pdf_source(pdf))
    if not text:
        return "No text found in the PDF."

    entities = named_entity_recognition(text, tokenizer, model)
    return entities if entities else "No named entities found."


# NOTE: the model is loaded and the UI is launched at module level, so this
# file is meant to be run as a script rather than imported.
tokenizer, model = load_model_and_tokenizer()

demo = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="text",
    title="Named Entity Recognition from PDF",
    description="Upload a PDF file to extract text and perform Named Entity Recognition using a pre-trained BERT model.",
)
demo.launch()