File size: 1,731 Bytes
f27eda8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pdfplumber
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
import gradio as gr

def load_model_and_tokenizer(model_name="dbmdz/bert-large-cased-finetuned-conll03-english"):
    """Load the tokenizer and token-classification model for a checkpoint.

    Args:
        model_name: Checkpoint identifier handed unchanged to both
            ``from_pretrained`` calls.

    Returns:
        A ``(tokenizer, model)`` tuple, in that order.
    """
    # Tuple elements are evaluated left to right, so the tokenizer is still
    # loaded before the model.
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForTokenClassification.from_pretrained(model_name),
    )

def named_entity_recognition(text, tokenizer, model, max_length=512):
    """Tag the tokens of *text* and return those labelled as entities.

    The text is tokenized, run through the model once, and every token is
    paired with the label of its highest-scoring class. Tokens labelled
    ``'O'`` are dropped, as are the special tokens the tokenizer adds around
    the input, which are not part of the text and must never be reported as
    entities.

    Note that the input is truncated to ``max_length`` tokens, so anything
    in *text* beyond that limit is not analysed.

    Args:
        text: The string to analyse.
        tokenizer: Callable tokenizer that returns a mapping containing
            ``"input_ids"`` and provides ``convert_ids_to_tokens``.
        model: Token-classification model whose output has ``logits`` and
            whose ``config.id2label`` maps class indices to label names.
        max_length: Maximum number of tokens kept after truncation.

    Returns:
        A list of ``(token, label)`` tuples in input order. Tokens are
        whatever ``convert_ids_to_tokens`` yields; they are not merged back
        into whole words.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        return_special_tokens_mask=True,
    )
    # The mask is bookkeeping for this function only; it is removed before
    # the forward pass because it is not a model input.
    special_mask = inputs.pop("special_tokens_mask", None)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    with torch.no_grad():
        outputs = model(**inputs)

    # Only the first (and only) sequence in the batch is inspected.
    label_ids = torch.argmax(outputs.logits, dim=-1)[0].tolist()
    if special_mask is None:
        # Tokenizer did not supply a mask: treat every position as real text.
        special_flags = [0] * len(tokens)
    else:
        special_flags = special_mask[0].tolist()

    id2label = model.config.id2label
    entities = []
    for token, label_id, is_special in zip(tokens, label_ids, special_flags):
        if is_special:
            continue
        label = id2label[label_id]
        if label != 'O':
            entities.append((token, label))
    return entities

def extract_text_from_pdf(pdf):
    """Return the text of every page of a PDF as one space-separated string.

    Args:
        pdf: Whatever ``pdfplumber.open`` accepts as its first argument.

    Returns:
        The per-page texts joined by single spaces, with leading and
        trailing whitespace removed. Pages for which ``extract_text()``
        returns nothing are skipped, so the result is ``""`` when no page
        yields text.
    """
    with pdfplumber.open(pdf) as document:
        page_texts = [page.extract_text() for page in document.pages]
    # Drop empty/None pages before joining so no stray separators appear.
    return " ".join(chunk for chunk in page_texts if chunk).strip()

def process_pdf(pdf):
    """Gradio callback: extract a PDF's text and run NER over it.

    Args:
        pdf: The value supplied by the file-upload input, or ``None`` when
            nothing was uploaded.

    Returns:
        The list of ``(token, label)`` tuples produced by
        ``named_entity_recognition``, or a human-readable message string
        when no file was given, the PDF has no extractable text, or no
        entities were found.
    """
    # Submitting the form without a file hands the callback None; answer
    # with a clear message instead of letting the PDF reader raise.
    if pdf is None:
        return "Please upload a PDF file."

    text = extract_text_from_pdf(pdf)
    if not text:
        return "No text found in the PDF."

    # tokenizer and model are the module-level objects loaded at start-up.
    entities = named_entity_recognition(text, tokenizer, model)
    return entities if entities else "No named entities found."

# Load the tokenizer and model once at start-up; process_pdf reads these
# module-level names on every request.
tokenizer, model = load_model_and_tokenizer()

# Build the web UI (one file-upload input, one text output) and start serving.
gr.Interface(
    process_pdf,
    gr.File(label="Upload PDF"),
    "text",
    title="Named Entity Recognition from PDF",
    description=(
        "Upload a PDF file to extract text and perform Named Entity "
        "Recognition using a pre-trained BERT model."
    ),
).launch()