antoniorached committed on
Commit
f27eda8
·
verified ·
1 Parent(s): fc2b14b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -0
app.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ from transformers import AutoModelForTokenClassification, AutoTokenizer
3
+ import torch
4
+ import gradio as gr
5
+
6
def load_model_and_tokenizer(model_name="dbmdz/bert-large-cased-finetuned-conll03-english"):
    """Load the tokenizer and token-classification model for *model_name*.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForTokenClassification.from_pretrained(model_name),
    )
10
+
11
def named_entity_recognition(text, tokenizer, model, max_length=512):
    """Tag every token of *text* and return the ones labelled as entities.

    The text is split into windows of at most ``max_length`` tokens and each
    window is classified separately, so input longer than one window is
    tagged in full instead of being cut off after the first window.

    Args:
        text: Raw text to analyse.
        tokenizer: Callable tokenizer that also provides
            ``convert_ids_to_tokens``.
        model: Token-classification model exposing ``config.id2label`` and
            returning an object with a ``logits`` attribute.
        max_length: Maximum number of tokens per window (default 512).

    Returns:
        A list of ``(token, label)`` tuples for every token whose predicted
        label is not ``'O'``, in document order. Tokens are returned as the
        tokenizer produced them (sub-word pieces are not merged). The list
        is empty for empty or whitespace-only text.
    """
    if not text or not text.strip():
        return []

    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        padding=True,  # windows must share one length to form a tensor
        return_overflowing_tokens=True,  # keep what truncation would drop
    )

    # Overflow bookkeeping entries are not inputs the model understands.
    bookkeeping = {
        "overflow_to_sample_mapping",
        "overflowing_tokens",
        "num_truncated_tokens",
    }
    model_inputs = {
        key: value for key, value in encoded.items() if key not in bookkeeping
    }

    id2label = model.config.id2label
    entities = []
    with torch.no_grad():
        # One window per forward pass keeps peak memory flat on long documents.
        for row in range(model_inputs["input_ids"].shape[0]):
            window = {
                key: value[row:row + 1] for key, value in model_inputs.items()
            }
            logits = model(**window).logits
            label_ids = torch.argmax(logits, dim=-1)[0].tolist()
            tokens = tokenizer.convert_ids_to_tokens(
                window["input_ids"][0].tolist()
            )

            attention = window.get("attention_mask")
            if attention is None:
                keep_flags = [1] * len(tokens)
            else:
                keep_flags = attention[0].tolist()

            for token, label_id, keep in zip(tokens, label_ids, keep_flags):
                if not keep:
                    # Padding position: its prediction is meaningless.
                    continue
                label = id2label[label_id]
                if label != 'O':
                    entities.append((token, label))

    return entities
23
+
24
def extract_text_from_pdf(pdf):
    """Return the text of every page in *pdf*, separated by single spaces.

    Args:
        pdf: The value handed to ``pdfplumber.open``.

    Returns:
        The concatenated page text with leading and trailing whitespace
        stripped. Pages whose ``extract_text()`` result is falsy are skipped,
        so the result is ``""`` when no page yields text.
    """
    with pdfplumber.open(pdf) as document:
        extracted = (page.extract_text() for page in document.pages)
        # Materialise inside the ``with`` so pages are read while the file is open.
        pieces = [chunk for chunk in extracted if chunk]
    return " ".join(pieces).strip()
32
+
33
def process_pdf(pdf):
    """Gradio callback: extract text from an uploaded PDF and tag entities.

    Args:
        pdf: The uploaded file as supplied by the ``gr.File`` input, or
            ``None`` when nothing was uploaded.

    Returns:
        A list of ``(token, label)`` tuples when entities are found;
        otherwise a human-readable message string explaining why there is
        no result (no file, no extractable text, or no entities).
    """
    # Submitting the form without a file yields None; opening that would crash.
    if pdf is None:
        return "No PDF file provided."

    text = extract_text_from_pdf(pdf)
    if not text:
        return "No text found in the PDF."

    # ``tokenizer`` and ``model`` are the module-level objects loaded at startup.
    entities = named_entity_recognition(text, tokenizer, model)
    return entities if entities else "No named entities found."
41
+
42
# Load the tokenizer and model once at startup; process_pdf reads these
# module-level names on every request.
tokenizer, model = load_model_and_tokenizer()

# Build the web UI around process_pdf, then start serving it.
demo = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="text",
    title="Named Entity Recognition from PDF",
    description="Upload a PDF file to extract text and perform Named Entity Recognition using a pre-trained BERT model.",
)
demo.launch()